Bug Summary

File: llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 36216, column 5
Division by zero

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-12-11-210320-5824-1 -x c++ /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineModuleInfo.h"
39#include "llvm/CodeGen/MachineRegisterInfo.h"
40#include "llvm/CodeGen/TargetLowering.h"
41#include "llvm/CodeGen/WinEHFuncInfo.h"
42#include "llvm/IR/CallingConv.h"
43#include "llvm/IR/Constants.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Function.h"
47#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/GlobalVariable.h"
49#include "llvm/IR/Instructions.h"
50#include "llvm/IR/Intrinsics.h"
51#include "llvm/MC/MCAsmInfo.h"
52#include "llvm/MC/MCContext.h"
53#include "llvm/MC/MCExpr.h"
54#include "llvm/MC/MCSymbol.h"
55#include "llvm/Support/CommandLine.h"
56#include "llvm/Support/Debug.h"
57#include "llvm/Support/ErrorHandling.h"
58#include "llvm/Support/KnownBits.h"
59#include "llvm/Support/MathExtras.h"
60#include "llvm/Target/TargetOptions.h"
61#include <algorithm>
62#include <bitset>
63#include <cctype>
64#include <numeric>
65using namespace llvm;
66
67#define DEBUG_TYPE "x86-isel"
68
69STATISTIC(NumTailCalls, "Number of tail calls");
70
71static cl::opt<int> ExperimentalPrefLoopAlignment(
72 "x86-experimental-pref-loop-alignment", cl::init(4),
73 cl::desc(
74 "Sets the preferable loop alignment for experiments (as log2 bytes)"
75 "(the last x86-experimental-pref-loop-alignment bits"
76 " of the loop header PC will be 0)."),
77 cl::Hidden);
78
79static cl::opt<bool> MulConstantOptimization(
80 "mul-constant-optimization", cl::init(true),
81 cl::desc("Replace 'mul x, Const' with more effective instructions like "
82 "SHIFT, LEA, etc."),
83 cl::Hidden);
84
85static cl::opt<bool> ExperimentalUnorderedISEL(
86 "x86-experimental-unordered-atomic-isel", cl::init(false),
87 cl::desc("Use LoadSDNode and StoreSDNode instead of "
88 "AtomicSDNode for unordered atomic loads and "
89 "stores respectively."),
90 cl::Hidden);
91
92/// Call this when the user attempts to do something unsupported, like
93/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
94/// report_fatal_error, so calling code should attempt to recover without
95/// crashing.
96static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
97 const char *Msg) {
98 MachineFunction &MF = DAG.getMachineFunction();
99 DAG.getContext()->diagnose(
100 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
101}
102
103X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
104 const X86Subtarget &STI)
105 : TargetLowering(TM), Subtarget(STI) {
106 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
107 X86ScalarSSEf64 = Subtarget.hasSSE2();
108 X86ScalarSSEf32 = Subtarget.hasSSE1();
109 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
110
111 // Set up the TargetLowering object.
112
113 // X86 is weird. It always uses i8 for shift amounts and setcc results.
114 setBooleanContents(ZeroOrOneBooleanContent);
115 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
116 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
117
118 // For 64-bit, since we have so many registers, use the ILP scheduler.
119 // For 32-bit, use the register pressure specific scheduling.
120 // For Atom, always use ILP scheduling.
121 if (Subtarget.isAtom())
122 setSchedulingPreference(Sched::ILP);
123 else if (Subtarget.is64Bit())
124 setSchedulingPreference(Sched::ILP);
125 else
126 setSchedulingPreference(Sched::RegPressure);
127 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
128 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
129
130 // Bypass expensive divides and use cheaper ones.
131 if (TM.getOptLevel() >= CodeGenOpt::Default) {
132 if (Subtarget.hasSlowDivide32())
133 addBypassSlowDiv(32, 8);
134 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
135 addBypassSlowDiv(64, 32);
136 }
137
138 // Setup Windows compiler runtime calls.
139 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
140 static const struct {
141 const RTLIB::Libcall Op;
142 const char * const Name;
143 const CallingConv::ID CC;
144 } LibraryCalls[] = {
145 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
146 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
147 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
148 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
149 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
150 };
151
152 for (const auto &LC : LibraryCalls) {
153 setLibcallName(LC.Op, LC.Name);
154 setLibcallCallingConv(LC.Op, LC.CC);
155 }
156 }
157
158 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
159 // MSVCRT doesn't have powi; fall back to pow
160 setLibcallName(RTLIB::POWI_F32, nullptr);
161 setLibcallName(RTLIB::POWI_F64, nullptr);
162 }
163
164 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
165 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
166 // FIXME: Should we be limiting the atomic size on other configs? Default is
167 // 1024.
168 if (!Subtarget.hasCmpxchg8b())
169 setMaxAtomicSizeInBitsSupported(32);
170
171 // Set up the register classes.
172 addRegisterClass(MVT::i8, &X86::GR8RegClass);
173 addRegisterClass(MVT::i16, &X86::GR16RegClass);
174 addRegisterClass(MVT::i32, &X86::GR32RegClass);
175 if (Subtarget.is64Bit())
176 addRegisterClass(MVT::i64, &X86::GR64RegClass);
177
178 for (MVT VT : MVT::integer_valuetypes())
179 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
180
181 // We don't accept any truncstore of integer registers.
182 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
183 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
184 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
185 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
188
189 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
190
191 // SETOEQ and SETUNE require checking two conditions.
192 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
193 setCondCodeAction(ISD::SETOEQ, VT, Expand);
194 setCondCodeAction(ISD::SETUNE, VT, Expand);
195 }
196
197 // Integer absolute.
198 if (Subtarget.hasCMov()) {
199 setOperationAction(ISD::ABS , MVT::i16 , Custom);
200 setOperationAction(ISD::ABS , MVT::i32 , Custom);
201 if (Subtarget.is64Bit())
202 setOperationAction(ISD::ABS , MVT::i64 , Custom);
203 }
204
205 // Funnel shifts.
206 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
207 // For slow shld targets we only lower for code size.
208 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
209
210 setOperationAction(ShiftOp , MVT::i8 , Custom);
211 setOperationAction(ShiftOp , MVT::i16 , Custom);
212 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
213 if (Subtarget.is64Bit())
214 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
215 }
216
217 if (!Subtarget.useSoftFloat()) {
218 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
219 // operation.
220 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
221 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
222 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
223 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
224 // We have an algorithm for SSE2, and we turn this into a 64-bit
225 // FILD or VCVTUSI2SS/SD for other targets.
226 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
227 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
228 // We have an algorithm for SSE2->double, and we turn this into a
229 // 64-bit FILD followed by conditional FADD for other targets.
230 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
231 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
232
233 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
234 // this operation.
235 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
236 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
237 // SSE has no i16 to fp conversion, only i32. We promote in the handler
238 // to allow f80 to use i16 and f64 to use i16 with sse1 only
239 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
240 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
241 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
242 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
243 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
244 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
245 // are Legal, f80 is custom lowered.
246 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
247 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
248
249 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
250 // this operation.
251 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
252 // FIXME: This doesn't generate invalid exception when it should. PR44019.
253 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
254 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
255 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
256 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
257 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
258 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
259 // are Legal, f80 is custom lowered.
260 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
261 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
262
263 // Handle FP_TO_UINT by promoting the destination to a larger signed
264 // conversion.
265 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
266 // FIXME: This doesn't generate invalid exception when it should. PR44019.
267 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
268 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
269 // FIXME: This doesn't generate invalid exception when it should. PR44019.
270 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
271 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
272 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
273 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
275
276 setOperationAction(ISD::LRINT, MVT::f32, Custom);
277 setOperationAction(ISD::LRINT, MVT::f64, Custom);
278 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
279 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
280
281 if (!Subtarget.is64Bit()) {
282 setOperationAction(ISD::LRINT, MVT::i64, Custom);
283 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
284 }
285 }
286
287 // Handle address space casts between mixed sized pointers.
288 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
289 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
290
291 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
292 if (!X86ScalarSSEf64) {
293 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
294 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
295 if (Subtarget.is64Bit()) {
296 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
297 // Without SSE, i64->f64 goes through memory.
298 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
299 }
300 } else if (!Subtarget.is64Bit())
301 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
302
303 // Scalar integer divide and remainder are lowered to use operations that
304 // produce two results, to match the available instructions. This exposes
305 // the two-result form to trivial CSE, which is able to combine x/y and x%y
306 // into a single instruction.
307 //
308 // Scalar integer multiply-high is also lowered to use two-result
309 // operations, to match the available instructions. However, plain multiply
310 // (low) operations are left as Legal, as there are single-result
311 // instructions for this in x86. Using the two-result multiply instructions
312 // when both high and low results are needed must be arranged by dagcombine.
313 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
314 setOperationAction(ISD::MULHS, VT, Expand);
315 setOperationAction(ISD::MULHU, VT, Expand);
316 setOperationAction(ISD::SDIV, VT, Expand);
317 setOperationAction(ISD::UDIV, VT, Expand);
318 setOperationAction(ISD::SREM, VT, Expand);
319 setOperationAction(ISD::UREM, VT, Expand);
320 }
321
322 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
323 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
324 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
325 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
326 setOperationAction(ISD::BR_CC, VT, Expand);
327 setOperationAction(ISD::SELECT_CC, VT, Expand);
328 }
329 if (Subtarget.is64Bit())
330 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
331 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
332 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
333 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
334
335 setOperationAction(ISD::FREM , MVT::f32 , Expand);
336 setOperationAction(ISD::FREM , MVT::f64 , Expand);
337 setOperationAction(ISD::FREM , MVT::f80 , Expand);
338 setOperationAction(ISD::FREM , MVT::f128 , Expand);
339 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
340
341 // Promote the i8 variants and force them on up to i32 which has a shorter
342 // encoding.
343 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
344 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
345 if (!Subtarget.hasBMI()) {
346 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
347 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
348 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
349 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
350 if (Subtarget.is64Bit()) {
351 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
352 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
353 }
354 }
355
356 if (Subtarget.hasLZCNT()) {
357 // When promoting the i8 variants, force them to i32 for a shorter
358 // encoding.
359 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
360 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
361 } else {
362 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
363 if (VT == MVT::i64 && !Subtarget.is64Bit())
364 continue;
365 setOperationAction(ISD::CTLZ , VT, Custom);
366 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
367 }
368 }
369
370 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
371 ISD::STRICT_FP_TO_FP16}) {
372 // Special handling for half-precision floating point conversions.
373 // If we don't have F16C support, then lower half float conversions
374 // into library calls.
375 setOperationAction(
376 Op, MVT::f32,
377 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
378 // There's never any support for operations beyond MVT::f32.
379 setOperationAction(Op, MVT::f64, Expand);
380 setOperationAction(Op, MVT::f80, Expand);
381 setOperationAction(Op, MVT::f128, Expand);
382 }
383
384 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
385 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
386 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
387 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
388 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
389 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
390 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
391 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
392
393 setOperationAction(ISD::PARITY, MVT::i8, Custom);
394 if (Subtarget.hasPOPCNT()) {
395 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
396 } else {
397 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
398 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
399 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
400 if (Subtarget.is64Bit())
401 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
402 else
403 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
404
405 setOperationAction(ISD::PARITY, MVT::i16, Custom);
406 setOperationAction(ISD::PARITY, MVT::i32, Custom);
407 if (Subtarget.is64Bit())
408 setOperationAction(ISD::PARITY, MVT::i64, Custom);
409 }
410
411 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
412
413 if (!Subtarget.hasMOVBE())
414 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
415
416 // X86 wants to expand cmov itself.
417 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
418 setOperationAction(ISD::SELECT, VT, Custom);
419 setOperationAction(ISD::SETCC, VT, Custom);
420 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
421 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
422 }
423 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
424 if (VT == MVT::i64 && !Subtarget.is64Bit())
425 continue;
426 setOperationAction(ISD::SELECT, VT, Custom);
427 setOperationAction(ISD::SETCC, VT, Custom);
428 }
429
430 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
431 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
432 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
433
434 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
435 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
436 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
437 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
438 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
439 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
440 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
441 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
442
443 // Darwin ABI issue.
444 for (auto VT : { MVT::i32, MVT::i64 }) {
445 if (VT == MVT::i64 && !Subtarget.is64Bit())
446 continue;
447 setOperationAction(ISD::ConstantPool , VT, Custom);
448 setOperationAction(ISD::JumpTable , VT, Custom);
449 setOperationAction(ISD::GlobalAddress , VT, Custom);
450 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
451 setOperationAction(ISD::ExternalSymbol , VT, Custom);
452 setOperationAction(ISD::BlockAddress , VT, Custom);
453 }
454
455 // 64-bit shl, sra, srl (iff 32-bit x86)
456 for (auto VT : { MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SHL_PARTS, VT, Custom);
460 setOperationAction(ISD::SRA_PARTS, VT, Custom);
461 setOperationAction(ISD::SRL_PARTS, VT, Custom);
462 }
463
464 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
465 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
466
467 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
468
469 // Expand certain atomics
470 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
472 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
473 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
477 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
478 }
479
480 if (!Subtarget.is64Bit())
481 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
482
483 if (Subtarget.hasCmpxchg16b()) {
484 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
485 }
486
487 // FIXME - use subtarget debug flags
488 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
489 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
490 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
491 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
492 }
493
494 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
495 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
496
497 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
498 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
499
500 setOperationAction(ISD::TRAP, MVT::Other, Legal);
501 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
502 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
503
504 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
505 setOperationAction(ISD::VASTART , MVT::Other, Custom);
506 setOperationAction(ISD::VAEND , MVT::Other, Expand);
507 bool Is64Bit = Subtarget.is64Bit();
508 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
509 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
510
511 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
512 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
513
514 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
515
516 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
517 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
518 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
519
520 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
521 // f32 and f64 use SSE.
522 // Set up the FP register classes.
523 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
524 : &X86::FR32RegClass);
525 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
526 : &X86::FR64RegClass);
527
528 // Disable f32->f64 extload as we can only generate this in one instruction
529 // under optsize. So it's easier to pattern match (fpext (load)) for that
530 // case instead of needing to emit 2 instructions for extload in the
531 // non-optsize case.
532 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
533
534 for (auto VT : { MVT::f32, MVT::f64 }) {
535 // Use ANDPD to simulate FABS.
536 setOperationAction(ISD::FABS, VT, Custom);
537
538 // Use XORP to simulate FNEG.
539 setOperationAction(ISD::FNEG, VT, Custom);
540
541 // Use ANDPD and ORPD to simulate FCOPYSIGN.
542 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
543
544 // These might be better off as horizontal vector ops.
545 setOperationAction(ISD::FADD, VT, Custom);
546 setOperationAction(ISD::FSUB, VT, Custom);
547
548 // We don't support sin/cos/fmod
549 setOperationAction(ISD::FSIN , VT, Expand);
550 setOperationAction(ISD::FCOS , VT, Expand);
551 setOperationAction(ISD::FSINCOS, VT, Expand);
552 }
553
554 // Lower this to MOVMSK plus an AND.
555 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
556 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
557
558 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
559 (UseX87 || Is64Bit)) {
560 // Use SSE for f32, x87 for f64.
561 // Set up the FP register classes.
562 addRegisterClass(MVT::f32, &X86::FR32RegClass);
563 if (UseX87)
564 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
565
566 // Use ANDPS to simulate FABS.
567 setOperationAction(ISD::FABS , MVT::f32, Custom);
568
569 // Use XORP to simulate FNEG.
570 setOperationAction(ISD::FNEG , MVT::f32, Custom);
571
572 if (UseX87)
573 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
574
575 // Use ANDPS and ORPS to simulate FCOPYSIGN.
576 if (UseX87)
577 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
578 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
579
580 // We don't support sin/cos/fmod
581 setOperationAction(ISD::FSIN , MVT::f32, Expand);
582 setOperationAction(ISD::FCOS , MVT::f32, Expand);
583 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
584
585 if (UseX87) {
586 // Always expand sin/cos functions even though x87 has an instruction.
587 setOperationAction(ISD::FSIN, MVT::f64, Expand);
588 setOperationAction(ISD::FCOS, MVT::f64, Expand);
589 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
590 }
591 } else if (UseX87) {
592 // f32 and f64 in x87.
593 // Set up the FP register classes.
594 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
595 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
596
597 for (auto VT : { MVT::f32, MVT::f64 }) {
598 setOperationAction(ISD::UNDEF, VT, Expand);
599 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
600
601 // Always expand sin/cos functions even though x87 has an instruction.
602 setOperationAction(ISD::FSIN , VT, Expand);
603 setOperationAction(ISD::FCOS , VT, Expand);
604 setOperationAction(ISD::FSINCOS, VT, Expand);
605 }
606 }
607
608 // Expand FP32 immediates into loads from the stack, save special cases.
609 if (isTypeLegal(MVT::f32)) {
610 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
611 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
612 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
613 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
614 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
615 } else // SSE immediates.
616 addLegalFPImmediate(APFloat(+0.0f)); // xorps
617 }
618 // Expand FP64 immediates into loads from the stack, save special cases.
619 if (isTypeLegal(MVT::f64)) {
620 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
621 addLegalFPImmediate(APFloat(+0.0)); // FLD0
622 addLegalFPImmediate(APFloat(+1.0)); // FLD1
623 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
624 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
625 } else // SSE immediates.
626 addLegalFPImmediate(APFloat(+0.0)); // xorpd
627 }
628 // Handle constrained floating-point operations of scalars.
629 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
630 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
631 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
632 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
633 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
634 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
635 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
636 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
637 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
638 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
639 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
640 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
641 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
642
643 // We don't support FMA.
644 setOperationAction(ISD::FMA, MVT::f64, Expand);
645 setOperationAction(ISD::FMA, MVT::f32, Expand);
646
647 // f80 always uses X87.
648 if (UseX87) {
649 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
650 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
651 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
652 {
653 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
654 addLegalFPImmediate(TmpFlt); // FLD0
655 TmpFlt.changeSign();
656 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
657
658 bool ignored;
659 APFloat TmpFlt2(+1.0);
660 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
661 &ignored);
662 addLegalFPImmediate(TmpFlt2); // FLD1
663 TmpFlt2.changeSign();
664 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
665 }
666
667 // Always expand sin/cos functions even though x87 has an instruction.
668 setOperationAction(ISD::FSIN , MVT::f80, Expand);
669 setOperationAction(ISD::FCOS , MVT::f80, Expand);
670 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
671
672 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
673 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
674 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
675 setOperationAction(ISD::FRINT, MVT::f80, Expand);
676 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
677 setOperationAction(ISD::FMA, MVT::f80, Expand);
678 setOperationAction(ISD::LROUND, MVT::f80, Expand);
679 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
680 setOperationAction(ISD::LRINT, MVT::f80, Custom);
681 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
682
683 // Handle constrained floating-point operations of scalars.
684 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
685 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
686 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
687 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
688 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
689 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
690 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
691 // as Custom.
692 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
693 }
694
695 // f128 uses xmm registers, but most operations require libcalls.
696 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
697 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
698 : &X86::VR128RegClass);
699
700 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
701
702 setOperationAction(ISD::FADD, MVT::f128, LibCall);
703 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
704 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
705 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
706 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
707 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
708 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
709 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
710 setOperationAction(ISD::FMA, MVT::f128, LibCall);
711 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
712
713 setOperationAction(ISD::FABS, MVT::f128, Custom);
714 setOperationAction(ISD::FNEG, MVT::f128, Custom);
715 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
716
717 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
718 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
719 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
720 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
721 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
722 // No STRICT_FSINCOS
723 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
724 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
725
726 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
727 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
728 // We need to custom handle any FP_ROUND with an f128 input, but
729 // LegalizeDAG uses the result type to know when to run a custom handler.
730 // So we have to list all legal floating point result types here.
731 if (isTypeLegal(MVT::f32)) {
732 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
733 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
734 }
735 if (isTypeLegal(MVT::f64)) {
736 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
737 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
738 }
739 if (isTypeLegal(MVT::f80)) {
740 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
741 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
742 }
743
744 setOperationAction(ISD::SETCC, MVT::f128, Custom);
745
746 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
747 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
748 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
749 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
750 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
751 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
752 }
753
754 // Always use a library call for pow.
755 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
756 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
757 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
758 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
759
760 setOperationAction(ISD::FLOG, MVT::f80, Expand);
761 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
762 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
763 setOperationAction(ISD::FEXP, MVT::f80, Expand);
764 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
765 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
766 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
767
768 // Some FP actions are always expanded for vector types.
769 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
770 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
771 setOperationAction(ISD::FSIN, VT, Expand);
772 setOperationAction(ISD::FSINCOS, VT, Expand);
773 setOperationAction(ISD::FCOS, VT, Expand);
774 setOperationAction(ISD::FREM, VT, Expand);
775 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
776 setOperationAction(ISD::FPOW, VT, Expand);
777 setOperationAction(ISD::FLOG, VT, Expand);
778 setOperationAction(ISD::FLOG2, VT, Expand);
779 setOperationAction(ISD::FLOG10, VT, Expand);
780 setOperationAction(ISD::FEXP, VT, Expand);
781 setOperationAction(ISD::FEXP2, VT, Expand);
782 }
783
784 // First set operation action for all vector types to either promote
785 // (for widening) or expand (for scalarization). Then we will selectively
786 // turn on ones that can be effectively codegen'd.
787 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
788 setOperationAction(ISD::SDIV, VT, Expand);
789 setOperationAction(ISD::UDIV, VT, Expand);
790 setOperationAction(ISD::SREM, VT, Expand);
791 setOperationAction(ISD::UREM, VT, Expand);
792 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
793 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
794 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
795 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
796 setOperationAction(ISD::FMA, VT, Expand);
797 setOperationAction(ISD::FFLOOR, VT, Expand);
798 setOperationAction(ISD::FCEIL, VT, Expand);
799 setOperationAction(ISD::FTRUNC, VT, Expand);
800 setOperationAction(ISD::FRINT, VT, Expand);
801 setOperationAction(ISD::FNEARBYINT, VT, Expand);
802 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
803 setOperationAction(ISD::MULHS, VT, Expand);
804 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
805 setOperationAction(ISD::MULHU, VT, Expand);
806 setOperationAction(ISD::SDIVREM, VT, Expand);
807 setOperationAction(ISD::UDIVREM, VT, Expand);
808 setOperationAction(ISD::CTPOP, VT, Expand);
809 setOperationAction(ISD::CTTZ, VT, Expand);
810 setOperationAction(ISD::CTLZ, VT, Expand);
811 setOperationAction(ISD::ROTL, VT, Expand);
812 setOperationAction(ISD::ROTR, VT, Expand);
813 setOperationAction(ISD::BSWAP, VT, Expand);
814 setOperationAction(ISD::SETCC, VT, Expand);
815 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
816 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
817 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
818 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
819 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
820 setOperationAction(ISD::TRUNCATE, VT, Expand);
821 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
822 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
823 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
824 setOperationAction(ISD::SELECT_CC, VT, Expand);
825 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
826 setTruncStoreAction(InnerVT, VT, Expand);
827
828 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
829 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
830
831 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
832 // types, we have to deal with them whether we ask for Expansion or not.
833 // Setting Expand causes its own optimisation problems though, so leave
834 // them legal.
835 if (VT.getVectorElementType() == MVT::i1)
836 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
837
838 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
839 // split/scalarized right now.
840 if (VT.getVectorElementType() == MVT::f16)
841 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
842 }
843 }
844
845 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
846 // with -msoft-float, disable use of MMX as well.
847 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
848 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
849 // No operations on x86mmx supported, everything uses intrinsics.
850 }
851
852 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
853 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
854 : &X86::VR128RegClass);
855
856 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
857 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
858 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
859 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
860 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
861 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
862 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
863 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
864
865 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
866 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
867
868 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
869 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
870 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
871 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
872 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
873 }
874
875 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
876 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
877 : &X86::VR128RegClass);
878
879 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
880 // registers cannot be used even for integer operations.
881 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
882 : &X86::VR128RegClass);
883 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
884 : &X86::VR128RegClass);
885 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
886 : &X86::VR128RegClass);
887 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
888 : &X86::VR128RegClass);
889
890 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
891 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
892 setOperationAction(ISD::SDIV, VT, Custom);
893 setOperationAction(ISD::SREM, VT, Custom);
894 setOperationAction(ISD::UDIV, VT, Custom);
895 setOperationAction(ISD::UREM, VT, Custom);
896 }
897
898 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
899 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
900 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
901
902 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
903 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
904 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
905 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
906 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
907 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
908 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
909 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
910 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
911 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
912 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
913 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
914 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
915
916 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
917 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
918 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
919 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
920 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
921 }
922
923 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
924 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
925 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
926 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
927 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
928 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
929 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
930 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
931 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
932 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
933
934 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
935 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
936 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
937
938 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
939 setOperationAction(ISD::SETCC, VT, Custom);
940 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
941 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
942 setOperationAction(ISD::CTPOP, VT, Custom);
943 setOperationAction(ISD::ABS, VT, Custom);
944
945 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
946 // setcc all the way to isel and prefer SETGT in some isel patterns.
947 setCondCodeAction(ISD::SETLT, VT, Custom);
948 setCondCodeAction(ISD::SETLE, VT, Custom);
949 }
950
951 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
952 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
953 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
954 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
955 setOperationAction(ISD::VSELECT, VT, Custom);
956 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
957 }
958
959 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
960 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
961 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
962 setOperationAction(ISD::VSELECT, VT, Custom);
963
964 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
965 continue;
966
967 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
968 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
969 }
970
971 // Custom lower v2i64 and v2f64 selects.
972 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
973 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
974 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
975 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
976 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
977
978 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
979 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
981 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
982
983 // Custom legalize these to avoid over promotion or custom promotion.
984 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
985 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
986 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
987 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
988 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
989 }
990
991 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
992 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
993 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
994 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
995
996 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
997 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
998
999 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1000 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1001
1002 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1003 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1004 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1005 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1006 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1007
1008 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1009 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1010 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1011 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1012
1013 // We want to legalize this to an f64 load rather than an i64 load on
1014 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1015 // store.
1016 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1017 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1018 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1019 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1020 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1021 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1022
1023 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1024 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1025 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1026 if (!Subtarget.hasAVX512())
1027 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1028
1029 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1030 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1031 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1032
1033 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1034
1035 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1036 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1037 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1038 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1039 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1040 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1041
1042 // In the customized shift lowering, the legal v4i32/v2i64 cases
1043 // in AVX2 will be recognized.
1044 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1045 setOperationAction(ISD::SRL, VT, Custom);
1046 setOperationAction(ISD::SHL, VT, Custom);
1047 setOperationAction(ISD::SRA, VT, Custom);
1048 }
1049
1050 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1051 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1052
1053 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1054 // shifts) is better.
1055 if (!Subtarget.useAVX512Regs() &&
1056 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1057 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1058
1059 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1060 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1061 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1062 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1063 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1064 }
1065
1066 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1067 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1068 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1069 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1070 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1071 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1072 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1073 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1074 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1075
1076 // These might be better off as horizontal vector ops.
1077 setOperationAction(ISD::ADD, MVT::i16, Custom);
1078 setOperationAction(ISD::ADD, MVT::i32, Custom);
1079 setOperationAction(ISD::SUB, MVT::i16, Custom);
1080 setOperationAction(ISD::SUB, MVT::i32, Custom);
1081 }
1082
1083 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1084 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1085 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1086 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1087 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1088 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1089 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1090 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1091 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1092 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1093 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1094 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1095 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1096 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1097
1098 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1099 }
1100
1101 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1102 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1103 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1104 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1105 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1106 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1107 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1108 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1109
1110 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1111
1112 // FIXME: Do we need to handle scalar-to-vector here?
1113 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1114
1115 // We directly match byte blends in the backend as they match the VSELECT
1116 // condition form.
1117 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1118
1119 // SSE41 brings specific instructions for doing vector sign extend even in
1120 // cases where we don't have SRA.
1121 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1122 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1123 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1124 }
1125
1126 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1127 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1128 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1129 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1130 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1131 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1132 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1133 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1134 }
1135
1136 // i8 vectors are custom because the source register and source
1137 // memory operand types are not the same width.
1138 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1139
1140 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1141 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1142 // do the pre and post work in the vector domain.
1143 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1144 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1145 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1146 // so that DAG combine doesn't try to turn it into uint_to_fp.
1147 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1148 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1149 }
1150 }
1151
1152 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1153 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1154 }
1155
1156 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1157 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1158 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1159 setOperationAction(ISD::ROTL, VT, Custom);
1160
1161 // XOP can efficiently perform BITREVERSE with VPPERM.
1162 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1163 setOperationAction(ISD::BITREVERSE, VT, Custom);
1164
1165 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1166 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1167 setOperationAction(ISD::BITREVERSE, VT, Custom);
1168 }
1169
1170 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1171 bool HasInt256 = Subtarget.hasInt256();
1172
1173 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1174 : &X86::VR256RegClass);
1175 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1176 : &X86::VR256RegClass);
1177 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1178 : &X86::VR256RegClass);
1179 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1180 : &X86::VR256RegClass);
1181 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1182 : &X86::VR256RegClass);
1183 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1184 : &X86::VR256RegClass);
1185
1186 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1187 setOperationAction(ISD::FFLOOR, VT, Legal);
1188 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1189 setOperationAction(ISD::FCEIL, VT, Legal);
1190 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1191 setOperationAction(ISD::FTRUNC, VT, Legal);
1192 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1193 setOperationAction(ISD::FRINT, VT, Legal);
1194 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1195 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1196 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1197 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1198 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1199
1200 setOperationAction(ISD::FROUND, VT, Custom);
1201
1202 setOperationAction(ISD::FNEG, VT, Custom);
1203 setOperationAction(ISD::FABS, VT, Custom);
1204 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1205 }
1206
1207 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1208 // even though v8i16 is a legal type.
1209 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1210 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1211 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1212 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1213 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1214 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1215
1216 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1217 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1218
1219 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1220 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1221 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1222 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1223 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1224 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1225 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1226 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1227 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1228 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1230 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1231
1232 if (!Subtarget.hasAVX512())
1233 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1234
1235 // In the customized shift lowering, the legal v8i32/v4i64 cases
1236 // in AVX2 will be recognized.
1237 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1238 setOperationAction(ISD::SRL, VT, Custom);
1239 setOperationAction(ISD::SHL, VT, Custom);
1240 setOperationAction(ISD::SRA, VT, Custom);
1241 }
1242
1243 // These types need custom splitting if their input is a 128-bit vector.
1244 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1245 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1246 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1247 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1248
1249 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1250 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1251
1252 // With BWI, expanding (and promoting the shifts) is the better option.
1253 if (!Subtarget.useBWIRegs())
1254 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1255
1256 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1257 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1258 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1259 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1260 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1261 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1262
1263 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1264 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1265 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1266 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1267 }
1268
1269 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1270 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1271 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1272 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1273
1274 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1275 setOperationAction(ISD::SETCC, VT, Custom);
1276 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1277 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1278 setOperationAction(ISD::CTPOP, VT, Custom);
1279 setOperationAction(ISD::CTLZ, VT, Custom);
1280
1281 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1282 // setcc all the way to isel and prefer SETGT in some isel patterns.
1283 setCondCodeAction(ISD::SETLT, VT, Custom);
1284 setCondCodeAction(ISD::SETLE, VT, Custom);
1285 }
1286
1287 if (Subtarget.hasAnyFMA()) {
1288 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1289 MVT::v2f64, MVT::v4f64 }) {
1290 setOperationAction(ISD::FMA, VT, Legal);
1291 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1292 }
1293 }
1294
1295 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1296 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1297 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1298 }
1299
1300 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1301 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1302 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1303 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1304
1305 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1306 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1307 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1308 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1309 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1310 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1311
1312 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1313 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1314 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1315 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1316 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1317
1318 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1319 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1320 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1321 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1322 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1323 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1324 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1325 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1326 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1327 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1328 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1329 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1330
1331 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1332 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1333 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1334 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1335 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1336 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1337 }
1338
1339 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1340 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1341 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1342 }
1343
1344 if (HasInt256) {
1345 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1346 // when we have a 256-bit-wide blend with immediate.
1347 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1348 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1349
1350 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1351 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1352 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1353 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1354 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1355 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1356 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1357 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1358 }
1359 }
1360
1361 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1362 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1363 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1364 setOperationAction(ISD::MSTORE, VT, Legal);
1365 }
1366
1367 // Extract subvector is special because the value type
1368 // (result) is 128-bit but the source is 256-bit wide.
1369 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1370 MVT::v4f32, MVT::v2f64 }) {
1371 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1372 }
1373
1374 // Custom lower several nodes for 256-bit types.
1375 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1376 MVT::v8f32, MVT::v4f64 }) {
1377 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1378 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1379 setOperationAction(ISD::VSELECT, VT, Custom);
1380 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1381 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1382 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1383 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1384 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1385 setOperationAction(ISD::STORE, VT, Custom);
1386 }
1387
1388 if (HasInt256) {
1389 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1390
1391 // Custom legalize 2x32 to get a little better code.
1392 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1393 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1394
1395 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1396 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1397 setOperationAction(ISD::MGATHER, VT, Custom);
1398 }
1399 }
1400
1401 // This block controls legalization of the mask vector sizes that are
1402 // available with AVX512. 512-bit vectors are in a separate block controlled
1403 // by useAVX512Regs.
1404 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1405 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1406 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1407 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1408 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1409 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1410
1411 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1412 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1413 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1414
1415 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1416 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1417 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1418 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1419 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1420 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1421 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1422 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1423 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1424 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1425 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1426 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1427
1428 // There is no byte sized k-register load or store without AVX512DQ.
1429 if (!Subtarget.hasDQI()) {
1430 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1431 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1432 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1433 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1434
1435 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1436 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1437 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1438 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1439 }
1440
1441 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1442 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1443 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1444 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1445 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1446 }
1447
1448 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1449 setOperationAction(ISD::ADD, VT, Custom);
1450 setOperationAction(ISD::SUB, VT, Custom);
1451 setOperationAction(ISD::MUL, VT, Custom);
1452 setOperationAction(ISD::UADDSAT, VT, Custom);
1453 setOperationAction(ISD::SADDSAT, VT, Custom);
1454 setOperationAction(ISD::USUBSAT, VT, Custom);
1455 setOperationAction(ISD::SSUBSAT, VT, Custom);
1456 setOperationAction(ISD::VSELECT, VT, Expand);
1457 }
1458
1459 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1460 setOperationAction(ISD::SETCC, VT, Custom);
1461 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1462 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1463 setOperationAction(ISD::SELECT, VT, Custom);
1464 setOperationAction(ISD::TRUNCATE, VT, Custom);
1465
1466 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1467 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1468 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1469 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1470 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1471 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1472 }
1473
1474 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1475 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1476 }
1477
1478 // This block controls legalization for 512-bit operations with 32/64 bit
1479 // elements. 512-bits can be disabled based on prefer-vector-width and
1480 // required-vector-width function attributes.
1481 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1482 bool HasBWI = Subtarget.hasBWI();
1483
1484 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1485 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1486 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1487 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1488 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1489 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1490
1491 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1492 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1493 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1494 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1495 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1496 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1497 if (HasBWI)
1498 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1499 }
1500
1501 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1502 setOperationAction(ISD::FNEG, VT, Custom);
1503 setOperationAction(ISD::FABS, VT, Custom);
1504 setOperationAction(ISD::FMA, VT, Legal);
1505 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1506 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1507 }
1508
1509 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1510 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1511 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1512 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1513 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1514 }
1515 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1516 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1517 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1518 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1519 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1520 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1521 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1522 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1523
1524 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1525 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1526 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1527 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1528 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1529 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1530 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1531 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1532 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1533 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1534 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1535 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1536
1537 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1538 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1539 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1540 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1541 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1542 if (HasBWI)
1543 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1544
1545 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1546 // to 512-bit rather than use the AVX2 instructions so that we can use
1547 // k-masks.
1548 if (!Subtarget.hasVLX()) {
1549 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1551 setOperationAction(ISD::MLOAD, VT, Custom);
1552 setOperationAction(ISD::MSTORE, VT, Custom);
1553 }
1554 }
1555
1556 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1557 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1558 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1559 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1560 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1561 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1562 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1563 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1564 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1565 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1566 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1567 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1568 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1569
1570 if (HasBWI) {
1571 // Extends from v64i1 masks to 512-bit vectors.
1572 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1573 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1574 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1575 }
1576
1577 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1578 setOperationAction(ISD::FFLOOR, VT, Legal);
1579 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1580 setOperationAction(ISD::FCEIL, VT, Legal);
1581 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1582 setOperationAction(ISD::FTRUNC, VT, Legal);
1583 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1584 setOperationAction(ISD::FRINT, VT, Legal);
1585 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1586 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1587 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1588 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1589 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1590
1591 setOperationAction(ISD::FROUND, VT, Custom);
1592 }
1593
1594 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1595 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1596 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1597 }
1598
1599 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1600 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1601 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1602 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1603
1604 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1605 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1606 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1607 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1608
1609 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1610 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1611 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1612 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1613 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1614 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1615
1616 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1617
1618 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1619 setOperationAction(ISD::SRL, VT, Custom);
1620 setOperationAction(ISD::SHL, VT, Custom);
1621 setOperationAction(ISD::SRA, VT, Custom);
1622 setOperationAction(ISD::SETCC, VT, Custom);
1623
1624 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1625 // setcc all the way to isel and prefer SETGT in some isel patterns.
1626 setCondCodeAction(ISD::SETLT, VT, Custom);
1627 setCondCodeAction(ISD::SETLE, VT, Custom);
1628 }
1629 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1630 setOperationAction(ISD::SMAX, VT, Legal);
1631 setOperationAction(ISD::UMAX, VT, Legal);
1632 setOperationAction(ISD::SMIN, VT, Legal);
1633 setOperationAction(ISD::UMIN, VT, Legal);
1634 setOperationAction(ISD::ABS, VT, Legal);
1635 setOperationAction(ISD::CTPOP, VT, Custom);
1636 setOperationAction(ISD::ROTL, VT, Custom);
1637 setOperationAction(ISD::ROTR, VT, Custom);
1638 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1639 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1640 }
1641
1642 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1643 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1644 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1645 setOperationAction(ISD::CTLZ, VT, Custom);
1646 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1647 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1648 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1649 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1650 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1651 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1652 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1653 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1654 }
1655
1656 if (Subtarget.hasDQI()) {
1657 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1658 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1659 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1660 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1661 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1662 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1663 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1664 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1665
1666 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1667 }
1668
1669 if (Subtarget.hasCDI()) {
1670 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1671 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1672 setOperationAction(ISD::CTLZ, VT, Legal);
1673 }
1674 } // Subtarget.hasCDI()
1675
1676 if (Subtarget.hasVPOPCNTDQ()) {
1677 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1678 setOperationAction(ISD::CTPOP, VT, Legal);
1679 }
1680
1681 // Extract subvector is special because the value type
1682 // (result) is 256-bit but the source is 512-bit wide.
1683 // 128-bit was made Legal under AVX1.
1684 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1685 MVT::v8f32, MVT::v4f64 })
1686 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1687
1688 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1689 MVT::v16f32, MVT::v8f64 }) {
1690 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1691 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1692 setOperationAction(ISD::SELECT, VT, Custom);
1693 setOperationAction(ISD::VSELECT, VT, Custom);
1694 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1695 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1696 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1697 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1698 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1699 }
1700
1701 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1702 setOperationAction(ISD::MLOAD, VT, Legal);
1703 setOperationAction(ISD::MSTORE, VT, Legal);
1704 setOperationAction(ISD::MGATHER, VT, Custom);
1705 setOperationAction(ISD::MSCATTER, VT, Custom);
1706 }
1707 if (HasBWI) {
1708 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1709 setOperationAction(ISD::MLOAD, VT, Legal);
1710 setOperationAction(ISD::MSTORE, VT, Legal);
1711 }
1712 } else {
1713 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1714 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1715 }
1716
1717 if (Subtarget.hasVBMI2()) {
1718 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1719 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1720 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1721 setOperationAction(ISD::FSHL, VT, Custom);
1722 setOperationAction(ISD::FSHR, VT, Custom);
1723 }
1724
1725 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1726 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1727 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1728 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1729 }
1730 }// useAVX512Regs
1731
1732 // This block controls legalization for operations that don't have
1733 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1734 // narrower widths.
1735 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1736 // These operations are handled on non-VLX by artificially widening in
1737 // isel patterns.
1738
1739 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1740 Subtarget.hasVLX() ? Legal : Custom);
1741 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1742 Subtarget.hasVLX() ? Legal : Custom);
1743 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1744 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1745 Subtarget.hasVLX() ? Legal : Custom);
1746 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1747 Subtarget.hasVLX() ? Legal : Custom);
1748 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1749 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1750 Subtarget.hasVLX() ? Legal : Custom);
1751 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1752 Subtarget.hasVLX() ? Legal : Custom);
1753 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1754 Subtarget.hasVLX() ? Legal : Custom);
1755 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1756 Subtarget.hasVLX() ? Legal : Custom);
1757
1758 if (Subtarget.hasDQI()) {
1759 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1760 // v2f32 UINT_TO_FP is already custom under SSE2.
1761 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1762        isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1763        "Unexpected operation action!");
1764 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1765 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1766 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1767 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1768 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1769 }
1770
1771 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1772 setOperationAction(ISD::SMAX, VT, Legal);
1773 setOperationAction(ISD::UMAX, VT, Legal);
1774 setOperationAction(ISD::SMIN, VT, Legal);
1775 setOperationAction(ISD::UMIN, VT, Legal);
1776 setOperationAction(ISD::ABS, VT, Legal);
1777 }
1778
1779 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1780 setOperationAction(ISD::ROTL, VT, Custom);
1781 setOperationAction(ISD::ROTR, VT, Custom);
1782 }
1783
1784 // Custom legalize 2x32 to get a little better code.
1785 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1786 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1787
1788 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1789 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1790 setOperationAction(ISD::MSCATTER, VT, Custom);
1791
1792 if (Subtarget.hasDQI()) {
1793 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1794 setOperationAction(ISD::SINT_TO_FP, VT,
1795 Subtarget.hasVLX() ? Legal : Custom);
1796 setOperationAction(ISD::UINT_TO_FP, VT,
1797 Subtarget.hasVLX() ? Legal : Custom);
1798 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1799 Subtarget.hasVLX() ? Legal : Custom);
1800 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1801 Subtarget.hasVLX() ? Legal : Custom);
1802 setOperationAction(ISD::FP_TO_SINT, VT,
1803 Subtarget.hasVLX() ? Legal : Custom);
1804 setOperationAction(ISD::FP_TO_UINT, VT,
1805 Subtarget.hasVLX() ? Legal : Custom);
1806 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1807 Subtarget.hasVLX() ? Legal : Custom);
1808 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1809 Subtarget.hasVLX() ? Legal : Custom);
1810 setOperationAction(ISD::MUL, VT, Legal);
1811 }
1812 }
1813
1814 if (Subtarget.hasCDI()) {
1815 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1816 setOperationAction(ISD::CTLZ, VT, Legal);
1817 }
1818 } // Subtarget.hasCDI()
1819
1820 if (Subtarget.hasVPOPCNTDQ()) {
1821 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1822 setOperationAction(ISD::CTPOP, VT, Legal);
1823 }
1824 }
1825
1826 // This block controls legalization of v32i1/v64i1 which are available with
1827 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1828 // useBWIRegs.
1829 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1830 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1831 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1832
1833 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1834 setOperationAction(ISD::ADD, VT, Custom);
1835 setOperationAction(ISD::SUB, VT, Custom);
1836 setOperationAction(ISD::MUL, VT, Custom);
1837 setOperationAction(ISD::VSELECT, VT, Expand);
1838 setOperationAction(ISD::UADDSAT, VT, Custom);
1839 setOperationAction(ISD::SADDSAT, VT, Custom);
1840 setOperationAction(ISD::USUBSAT, VT, Custom);
1841 setOperationAction(ISD::SSUBSAT, VT, Custom);
1842
1843 setOperationAction(ISD::TRUNCATE, VT, Custom);
1844 setOperationAction(ISD::SETCC, VT, Custom);
1845 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1846 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1847 setOperationAction(ISD::SELECT, VT, Custom);
1848 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1849 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1850 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1851 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1852 }
1853
1854 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1855 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1856
1857 // Extends from v32i1 masks to 256-bit vectors.
1858 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1859 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1860 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1861
1862 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1863 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1864 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1865 }
1866
1867 // These operations are handled on non-VLX by artificially widening in
1868 // isel patterns.
1869 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1870
1871 if (Subtarget.hasBITALG()) {
1872 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1873 setOperationAction(ISD::CTPOP, VT, Legal);
1874 }
1875 }
1876
1877 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1878 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1879 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1880 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1881 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1882 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1883
1884 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1885 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1886 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1887 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1888 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1889
1890 if (Subtarget.hasBWI()) {
1891 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1892 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1893 }
1894
1895 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1896 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1897 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1898 }
1899
1900 if (Subtarget.hasAMXTILE()) {
1901 addRegisterClass(MVT::v256i32, &X86::TILERegClass);
1902 }
1903
1904 // We want to custom lower some of our intrinsics.
1905 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1906 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1907 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1908 if (!Subtarget.is64Bit()) {
1909 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1910 }
1911
1912 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1913 // handle type legalization for these operations here.
1914 //
1915 // FIXME: We really should do custom legalization for addition and
1916 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1917 // than generic legalization for 64-bit multiplication-with-overflow, though.
1918 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1919 if (VT == MVT::i64 && !Subtarget.is64Bit())
1920 continue;
1921 // Add/Sub/Mul with overflow operations are custom lowered.
1922 setOperationAction(ISD::SADDO, VT, Custom);
1923 setOperationAction(ISD::UADDO, VT, Custom);
1924 setOperationAction(ISD::SSUBO, VT, Custom);
1925 setOperationAction(ISD::USUBO, VT, Custom);
1926 setOperationAction(ISD::SMULO, VT, Custom);
1927 setOperationAction(ISD::UMULO, VT, Custom);
1928
1929 // Support carry in as value rather than glue.
1930 setOperationAction(ISD::ADDCARRY, VT, Custom);
1931 setOperationAction(ISD::SUBCARRY, VT, Custom);
1932 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1933 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1934 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1935 }
1936
1937 if (!Subtarget.is64Bit()) {
1938 // These libcalls are not available in 32-bit.
1939 setLibcallName(RTLIB::SHL_I128, nullptr);
1940 setLibcallName(RTLIB::SRL_I128, nullptr);
1941 setLibcallName(RTLIB::SRA_I128, nullptr);
1942 setLibcallName(RTLIB::MUL_I128, nullptr);
1943 }
1944
1945 // Combine sin / cos into _sincos_stret if it is available.
1946 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1947 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1948 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1949 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1950 }
1951
1952 if (Subtarget.isTargetWin64()) {
1953 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1954 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1955 setOperationAction(ISD::SREM, MVT::i128, Custom);
1956 setOperationAction(ISD::UREM, MVT::i128, Custom);
1957 }
1958
1959 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1960 // is. We should promote the value to 64-bits to solve this.
1961 // This is what the CRT headers do - `fmodf` is an inline header
1962 // function casting to f64 and calling `fmod`.
1963 if (Subtarget.is32Bit() &&
1964 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1965 for (ISD::NodeType Op :
1966 {ISD::FCEIL, ISD::STRICT_FCEIL,
1967 ISD::FCOS, ISD::STRICT_FCOS,
1968 ISD::FEXP, ISD::STRICT_FEXP,
1969 ISD::FFLOOR, ISD::STRICT_FFLOOR,
1970 ISD::FREM, ISD::STRICT_FREM,
1971 ISD::FLOG, ISD::STRICT_FLOG,
1972 ISD::FLOG10, ISD::STRICT_FLOG10,
1973 ISD::FPOW, ISD::STRICT_FPOW,
1974 ISD::FSIN, ISD::STRICT_FSIN})
1975 if (isOperationExpand(Op, MVT::f32))
1976 setOperationAction(Op, MVT::f32, Promote);
1977
1978 // We have target-specific dag combine patterns for the following nodes:
1979 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1980 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1981 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
1982 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1983 setTargetDAGCombine(ISD::CONCAT_VECTORS);
1984 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1985 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1986 setTargetDAGCombine(ISD::BITCAST);
1987 setTargetDAGCombine(ISD::VSELECT);
1988 setTargetDAGCombine(ISD::SELECT);
1989 setTargetDAGCombine(ISD::SHL);
1990 setTargetDAGCombine(ISD::SRA);
1991 setTargetDAGCombine(ISD::SRL);
1992 setTargetDAGCombine(ISD::OR);
1993 setTargetDAGCombine(ISD::AND);
1994 setTargetDAGCombine(ISD::ADD);
1995 setTargetDAGCombine(ISD::FADD);
1996 setTargetDAGCombine(ISD::FSUB);
1997 setTargetDAGCombine(ISD::FNEG);
1998 setTargetDAGCombine(ISD::FMA);
1999 setTargetDAGCombine(ISD::STRICT_FMA);
2000 setTargetDAGCombine(ISD::FMINNUM);
2001 setTargetDAGCombine(ISD::FMAXNUM);
2002 setTargetDAGCombine(ISD::SUB);
2003 setTargetDAGCombine(ISD::LOAD);
2004 setTargetDAGCombine(ISD::MLOAD);
2005 setTargetDAGCombine(ISD::STORE);
2006 setTargetDAGCombine(ISD::MSTORE);
2007 setTargetDAGCombine(ISD::TRUNCATE);
2008 setTargetDAGCombine(ISD::ZERO_EXTEND);
2009 setTargetDAGCombine(ISD::ANY_EXTEND);
2010 setTargetDAGCombine(ISD::SIGN_EXTEND);
2011 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2012 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2013 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2014 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2015 setTargetDAGCombine(ISD::SINT_TO_FP);
2016 setTargetDAGCombine(ISD::UINT_TO_FP);
2017 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2018 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2019 setTargetDAGCombine(ISD::SETCC);
2020 setTargetDAGCombine(ISD::MUL);
2021 setTargetDAGCombine(ISD::XOR);
2022 setTargetDAGCombine(ISD::MSCATTER);
2023 setTargetDAGCombine(ISD::MGATHER);
2024 setTargetDAGCombine(ISD::FP16_TO_FP);
2025 setTargetDAGCombine(ISD::FP_EXTEND);
2026 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2027 setTargetDAGCombine(ISD::FP_ROUND);
2028
2029 computeRegisterProperties(Subtarget.getRegisterInfo());
2030
2031 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2032 MaxStoresPerMemsetOptSize = 8;
2033 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2034 MaxStoresPerMemcpyOptSize = 4;
2035 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2036 MaxStoresPerMemmoveOptSize = 4;
2037
2038 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2039 // that needs to be benchmarked and balanced with the potential use of vector
2040 // load/store types (PR33329, PR33914).
2041 MaxLoadsPerMemcmp = 2;
2042 MaxLoadsPerMemcmpOptSize = 2;
2043
2044 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2045 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2046
2047 // An out-of-order CPU can speculatively execute past a predictable branch,
2048 // but a conditional move could be stalled by an expensive earlier operation.
2049 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2050 EnableExtLdPromotion = true;
2051 setPrefFunctionAlignment(Align(16));
2052
2053 verifyIntrinsicTables();
2054
2055 // Default to having -disable-strictnode-mutation on
2056 IsStrictFPEnabled = true;
2057}
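
// A minimal standalone sketch (plain C++, no LLVM headers; "toy::ActionTable"
// and its string keys are made up for illustration). It models the pattern the
// constructor above is driving: setOperationAction() records an action per
// (opcode, value type), and legalization later queries that table to decide
// whether a node is kept as-is, custom-lowered, expanded, or promoted.
#include <cstdio>
#include <map>
#include <string>
#include <utility>

namespace toy {
enum Action { Legal, Custom, Expand, Promote };

struct ActionTable {
  std::map<std::pair<std::string, std::string>, Action> Table;

  // Analogous to setOperationAction(Op, VT, Action) in the constructor above.
  void set(const std::string &Op, const std::string &VT, Action A) {
    Table[{Op, VT}] = A;
  }

  // Anything not explicitly recorded defaults to Legal (a simplification).
  Action get(const std::string &Op, const std::string &VT) const {
    auto It = Table.find({Op, VT});
    return It == Table.end() ? Legal : It->second;
  }
};
} // namespace toy

int main() {
  toy::ActionTable TLI;
  TLI.set("MUL", "v32i8", toy::Custom);   // mirrors setOperationAction(ISD::MUL, MVT::v32i8, Custom)
  TLI.set("VSELECT", "v32i1", toy::Expand);
  std::printf("MUL v32i8 -> %d, ADD v8i32 -> %d\n",
              TLI.get("MUL", "v32i8"), TLI.get("ADD", "v8i32"));
  return 0;
}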
2058
2059// This has so far only been implemented for 64-bit MachO.
2060bool X86TargetLowering::useLoadStackGuardNode() const {
2061 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2062}
2063
2064bool X86TargetLowering::useStackGuardXorFP() const {
2065 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2066 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2067}
2068
2069SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2070 const SDLoc &DL) const {
2071 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2072 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2073 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2074 return SDValue(Node, 0);
2075}
2076
2077TargetLoweringBase::LegalizeTypeAction
2078X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2079 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2080 !Subtarget.hasBWI())
2081 return TypeSplitVector;
2082
2083 if (VT.getVectorNumElements() != 1 &&
2084 VT.getVectorElementType() != MVT::i1)
2085 return TypeWidenVector;
2086
2087 return TargetLoweringBase::getPreferredVectorAction(VT);
2088}
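
// A few worked cases for the policy above:
//   v32i1 or v64i1, AVX-512 without BWI -> split the vector (no wide
//                                          k-register arithmetic available)
//   v3i32 on any subtarget              -> widen (to v4i32 in practice)
//   v2i1                                -> fall back to the generic default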
2089
2090static std::pair<MVT, unsigned>
2091handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2092 const X86Subtarget &Subtarget) {
2093 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2094 // convention is one that uses k registers.
2095 if (NumElts == 2)
2096 return {MVT::v2i64, 1};
2097 if (NumElts == 4)
2098 return {MVT::v4i32, 1};
2099 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2100 CC != CallingConv::Intel_OCL_BI)
2101 return {MVT::v8i16, 1};
2102 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2103 CC != CallingConv::Intel_OCL_BI)
2104 return {MVT::v16i8, 1};
2105 // v32i1 passes in ymm unless we have BWI and the calling convention is
2106 // regcall.
2107 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2108 return {MVT::v32i8, 1};
2109 // Split v64i1 vectors if we don't have v64i8 available.
2110 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2111 if (Subtarget.useAVX512Regs())
2112 return {MVT::v64i8, 1};
2113 return {MVT::v32i8, 2};
2114 }
2115
2116 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2117 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2118 NumElts > 64)
2119 return {MVT::i8, NumElts};
2120
2121 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2122}
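
// A standalone sketch of the same mapping for the plain C calling convention
// (plain C++, no LLVM types; maskRegisterForC and its string results are
// hypothetical names). v1i1 and the RegCall/Intel_OCL_BI special cases are
// omitted; they fall through to the generic handling above.
#include <cassert>
#include <string>
#include <utility>

static std::pair<std::string, unsigned>
maskRegisterForC(unsigned NumElts, bool HasBWI, bool Use512BitRegs) {
  if (NumElts == 2)  return {"v2i64", 1};
  if (NumElts == 4)  return {"v4i32", 1};
  if (NumElts == 8)  return {"v8i16", 1};
  if (NumElts == 16) return {"v16i8", 1};
  if (NumElts == 32) return {"v32i8", 1};
  if (NumElts == 64 && HasBWI)
    return Use512BitRegs ? std::make_pair(std::string("v64i8"), 1u)
                         : std::make_pair(std::string("v32i8"), 2u);
  // Wide, odd, or BWI-less 64-element masks are broken into i8 scalars.
  return {"i8", NumElts};
}

int main() {
  assert(maskRegisterForC(8, false, false) ==
         std::make_pair(std::string("v8i16"), 1u));
  assert(maskRegisterForC(64, true, false) ==
         std::make_pair(std::string("v32i8"), 2u));
  assert(maskRegisterForC(5, true, true) ==
         std::make_pair(std::string("i8"), 5u));
  return 0;
}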
2123
2124MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2125 CallingConv::ID CC,
2126 EVT VT) const {
2127 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2128 Subtarget.hasAVX512()) {
2129 unsigned NumElts = VT.getVectorNumElements();
2130
2131 MVT RegisterVT;
2132 unsigned NumRegisters;
2133 std::tie(RegisterVT, NumRegisters) =
2134 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2135 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2136 return RegisterVT;
2137 }
2138
2139 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2140}
2141
2142unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2143 CallingConv::ID CC,
2144 EVT VT) const {
2145 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2146 Subtarget.hasAVX512()) {
2147 unsigned NumElts = VT.getVectorNumElements();
2148
2149 MVT RegisterVT;
2150 unsigned NumRegisters;
2151 std::tie(RegisterVT, NumRegisters) =
2152 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2153 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2154 return NumRegisters;
2155 }
2156
2157 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2158}
2159
2160unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2161 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2162 unsigned &NumIntermediates, MVT &RegisterVT) const {
2163 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2164 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2165 Subtarget.hasAVX512() &&
2166 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2167 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2168 VT.getVectorNumElements() > 64)) {
2169 RegisterVT = MVT::i8;
2170 IntermediateVT = MVT::i1;
2171 NumIntermediates = VT.getVectorNumElements();
2172 return NumIntermediates;
2173 }
2174
2175 // Split v64i1 vectors if we don't have v64i8 available.
2176 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2177 CC != CallingConv::X86_RegCall) {
2178 RegisterVT = MVT::v32i8;
2179 IntermediateVT = MVT::v32i1;
2180 NumIntermediates = 2;
2181 return 2;
2182 }
2183
2184 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2185 NumIntermediates, RegisterVT);
2186}
2187
2188EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2189 LLVMContext& Context,
2190 EVT VT) const {
2191 if (!VT.isVector())
2192 return MVT::i8;
2193
2194 if (Subtarget.hasAVX512()) {
2195 const unsigned NumElts = VT.getVectorNumElements();
2196
2197 // Figure out what this type will be legalized to.
2198 EVT LegalVT = VT;
2199 while (getTypeAction(Context, LegalVT) != TypeLegal)
2200 LegalVT = getTypeToTransformTo(Context, LegalVT);
2201
2202 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2203 if (LegalVT.getSimpleVT().is512BitVector())
2204 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2205
2206 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2207 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2208 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2209 // vXi16/vXi8.
2210 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2211 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2212 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2213 }
2214 }
2215
2216 return VT.changeVectorElementTypeToInteger();
2217}
2218
2219/// Helper for getByValTypeAlignment to determine
2220/// the desired ByVal argument alignment.
2221static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2222 if (MaxAlign == 16)
2223 return;
2224 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2225 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2226 MaxAlign = Align(16);
2227 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2228 Align EltAlign;
2229 getMaxByValAlign(ATy->getElementType(), EltAlign);
2230 if (EltAlign > MaxAlign)
2231 MaxAlign = EltAlign;
2232 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2233 for (auto *EltTy : STy->elements()) {
2234 Align EltAlign;
2235 getMaxByValAlign(EltTy, EltAlign);
2236 if (EltAlign > MaxAlign)
2237 MaxAlign = EltAlign;
2238 if (MaxAlign == 16)
2239 break;
2240 }
2241 }
2242}
2243
2244/// Return the desired alignment for ByVal aggregate
2245/// function arguments in the caller parameter area. For X86, aggregates
2246/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2247/// are at 4-byte boundaries.
2248unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2249 const DataLayout &DL) const {
2250 if (Subtarget.is64Bit()) {
2251 // Max of 8 and alignment of type.
2252 Align TyAlign = DL.getABITypeAlign(Ty);
2253 if (TyAlign > 8)
2254 return TyAlign.value();
2255 return 8;
2256 }
2257
2258 Align Alignment(4);
2259 if (Subtarget.hasSSE1())
2260 getMaxByValAlign(Ty, Alignment);
2261 return Alignment.value();
2262 }
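
// Worked examples for the rules above (hypothetical argument types):
//   x86-64,   struct { int a; }           -> 8  (ABI align 4, raised to the 8 minimum)
//   x86-64,   __m256                      -> 32 (ABI align already exceeds 8)
//   i386+SSE, struct { __m128 v; int a; } -> 16 (aggregate contains a 128-bit vector)
//   i386,     struct { int a; double b; } -> 4  (no SSE vector member)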
2263
2264/// It returns EVT::Other if the type should be determined using generic
2265/// target-independent logic.
2266/// For vector ops we check that the overall size isn't larger than our
2267/// preferred vector width.
2268EVT X86TargetLowering::getOptimalMemOpType(
2269 const MemOp &Op, const AttributeList &FuncAttributes) const {
2270 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2271 if (Op.size() >= 16 &&
2272 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2273 // FIXME: Check if unaligned 64-byte accesses are slow.
2274 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2275 (Subtarget.getPreferVectorWidth() >= 512)) {
2276 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2277 }
2278 // FIXME: Check if unaligned 32-byte accesses are slow.
2279 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2280 (Subtarget.getPreferVectorWidth() >= 256)) {
2281 // Although this isn't a well-supported type for AVX1, we'll let
2282 // legalization and shuffle lowering produce the optimal codegen. If we
2283 // choose an optimal type with a vector element larger than a byte,
2284 // getMemsetStores() may create an intermediate splat (using an integer
2285 // multiply) before we splat as a vector.
2286 return MVT::v32i8;
2287 }
2288 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2289 return MVT::v16i8;
2290 // TODO: Can SSE1 handle a byte vector?
2291 // If we have SSE1 registers we should be able to use them.
2292 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2293 (Subtarget.getPreferVectorWidth() >= 128))
2294 return MVT::v4f32;
2295 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2296 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2297 // Do not use f64 to lower memcpy if source is string constant. It's
2298 // better to use i32 to avoid the loads.
2299 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2300 // The gymnastics of splatting a byte value into an XMM register and then
2301 // only using 8-byte stores (because this is a CPU with slow unaligned
2302 // 16-byte accesses) makes that a loser.
2303 return MVT::f64;
2304 }
2305 }
2306 // This is a compromise. If we reach here, unaligned accesses may be slow on
2307 // this target. However, creating smaller, aligned accesses could be even
2308 // slower and would certainly be a lot more code.
2309 if (Subtarget.is64Bit() && Op.size() >= 8)
2310 return MVT::i64;
2311 return MVT::i32;
2312}
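
// Worked examples for the selection above (assuming no NoImplicitFloat
// attribute and that the alignment/speed checks pass):
//   >= 64 bytes, AVX-512, 512-bit width preferred -> v64i8 with BWI, else v16i32
//   >= 32 bytes, AVX,     256-bit width preferred -> v32i8
//   >= 16 bytes, SSE2,    128-bit width preferred -> v16i8
//   8-byte zero-memset on a 32-bit target w/ SSE2 -> f64
//   otherwise -> i64 on 64-bit targets for >= 8 bytes, else i32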
2313
2314bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2315 if (VT == MVT::f32)
2316 return X86ScalarSSEf32;
2317 else if (VT == MVT::f64)
2318 return X86ScalarSSEf64;
2319 return true;
2320}
2321
2322bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2323 EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
2324 bool *Fast) const {
2325 if (Fast) {
2326 switch (VT.getSizeInBits()) {
2327 default:
2328 // 8-byte and under are always assumed to be fast.
2329 *Fast = true;
2330 break;
2331 case 128:
2332 *Fast = !Subtarget.isUnalignedMem16Slow();
2333 break;
2334 case 256:
2335 *Fast = !Subtarget.isUnalignedMem32Slow();
2336 break;
2337 // TODO: What about AVX-512 (512-bit) accesses?
2338 }
2339 }
2340 // NonTemporal vector memory ops must be aligned.
2341 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2342 // NT loads can only be vector aligned, so if it's less aligned than the
2343 // minimum vector size (which we can split the vector down to), we might as
2344 // well use a regular unaligned vector load.
2345 // We don't have any NT loads pre-SSE41.
2346 if (!!(Flags & MachineMemOperand::MOLoad))
2347 return (Align < 16 || !Subtarget.hasSSE41());
2348 return false;
2349 }
2350 // Misaligned accesses of any size are always allowed.
2351 return true;
2352}
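
// Summary of the rules above: ordinary (non-NT) accesses are always allowed;
// for 128/256-bit types the *Fast hint reflects isUnalignedMem16Slow() /
// isUnalignedMem32Slow(), and anything 8 bytes or under is reported as fast.
// Non-temporal vector stores must be aligned (misaligned ones are rejected),
// and misaligned non-temporal vector loads are only accepted when they would
// be lowered as regular unaligned loads (alignment < 16 or no SSE4.1).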
2353
2354/// Return the entry encoding for a jump table in the
2355/// current function. The returned value is a member of the
2356/// MachineJumpTableInfo::JTEntryKind enum.
2357unsigned X86TargetLowering::getJumpTableEncoding() const {
2358 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2359 // symbol.
2360 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2361 return MachineJumpTableInfo::EK_Custom32;
2362
2363 // Otherwise, use the normal jump table encoding heuristics.
2364 return TargetLowering::getJumpTableEncoding();
2365}
2366
2367bool X86TargetLowering::useSoftFloat() const {
2368 return Subtarget.useSoftFloat();
2369}
2370
2371void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2372 ArgListTy &Args) const {
2373
2374 // Only relabel X86-32 for C / Stdcall CCs.
2375 if (Subtarget.is64Bit())
2376 return;
2377 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2378 return;
2379 unsigned ParamRegs = 0;
2380 if (auto *M = MF->getFunction().getParent())
2381 ParamRegs = M->getNumberRegisterParameters();
2382
2383 // Mark the first N int arguments as being passed in registers.
2384 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2385 Type *T = Args[Idx].Ty;
2386 if (T->isIntOrPtrTy())
2387 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2388 unsigned numRegs = 1;
2389 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2390 numRegs = 2;
2391 if (ParamRegs < numRegs)
2392 return;
2393 ParamRegs -= numRegs;
2394 Args[Idx].IsInReg = true;
2395 }
2396 }
2397}
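
// Worked example for the loop above: X86-32, C calling convention, three
// register parameters available (e.g. -mregparm=3), libcall arguments
// (i32, i64, i32). The first i32 consumes one register and the i64 consumes
// two, exhausting the budget, so only the first two arguments are marked
// IsInReg; the trailing i32 stays on the stack.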
2398
2399const MCExpr *
2400X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2401 const MachineBasicBlock *MBB,
2402 unsigned uid,MCContext &Ctx) const{
2403 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2404 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2405 // entries.
2406 return MCSymbolRefExpr::create(MBB->getSymbol(),
2407 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2408}
2409
2410/// Returns relocation base for the given PIC jumptable.
2411SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2412 SelectionDAG &DAG) const {
2413 if (!Subtarget.is64Bit())
2414 // This doesn't have SDLoc associated with it, but is not really the
2415 // same as a Register.
2416 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2417 getPointerTy(DAG.getDataLayout()));
2418 return Table;
2419}
2420
2421/// This returns the relocation base for the given PIC jumptable,
2422/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2423const MCExpr *X86TargetLowering::
2424getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2425 MCContext &Ctx) const {
2426 // X86-64 uses RIP relative addressing based on the jump table label.
2427 if (Subtarget.isPICStyleRIPRel())
2428 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2429
2430 // Otherwise, the reference is relative to the PIC base.
2431 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2432}
2433
2434std::pair<const TargetRegisterClass *, uint8_t>
2435X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2436 MVT VT) const {
2437 const TargetRegisterClass *RRC = nullptr;
2438 uint8_t Cost = 1;
2439 switch (VT.SimpleTy) {
2440 default:
2441 return TargetLowering::findRepresentativeClass(TRI, VT);
2442 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2443 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2444 break;
2445 case MVT::x86mmx:
2446 RRC = &X86::VR64RegClass;
2447 break;
2448 case MVT::f32: case MVT::f64:
2449 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2450 case MVT::v4f32: case MVT::v2f64:
2451 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2452 case MVT::v8f32: case MVT::v4f64:
2453 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2454 case MVT::v16f32: case MVT::v8f64:
2455 RRC = &X86::VR128XRegClass;
2456 break;
2457 }
2458 return std::make_pair(RRC, Cost);
2459}
2460
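// The magic numbers returned below are the X86 segment address spaces:
// 256 is %gs and 257 is %fs (X86AS::GS / X86AS::FS), so stack-protector and
// TLS accesses end up as %gs:- or %fs:-relative loads.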
2461unsigned X86TargetLowering::getAddressSpace() const {
2462 if (Subtarget.is64Bit())
2463 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2464 return 256;
2465}
2466
2467static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2468 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2469 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2470}
2471
2472static Constant* SegmentOffset(IRBuilder<> &IRB,
2473 unsigned Offset, unsigned AddressSpace) {
2474 return ConstantExpr::getIntToPtr(
2475 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2476 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2477}
2478
2479Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2480 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2481 // tcbhead_t; use it instead of the usual global variable (see
2482 // sysdeps/{i386,x86_64}/nptl/tls.h)
2483 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2484 if (Subtarget.isTargetFuchsia()) {
2485 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2486 return SegmentOffset(IRB, 0x10, getAddressSpace());
2487 } else {
2488 unsigned AddressSpace = getAddressSpace();
2489 // Some users may customize the base register and offset.
2490 unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
2491 // If -stack-protector-guard-offset was not set, use the default:
2492 // %fs:0x28, unless we're using a Kernel code model, in which case
2493 // it's %gs:0x28; %gs:0x14 on i386.
2494 if (Offset == (unsigned)-1)
2495 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2496
2497 auto GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
2498 if (GuardReg == "fs")
2499 AddressSpace = X86AS::FS;
2500 else if (GuardReg == "gs")
2501 AddressSpace = X86AS::GS;
2502 return SegmentOffset(IRB, Offset, AddressSpace);
2503 }
2504 }
2505 return TargetLowering::getIRStackGuard(IRB);
2506}
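
Assuming the defaults above, the guard slot resolves to %fs:0x28 on 64-bit glibc/bionic targets, %gs:0x14 on i386, and offset 0x10 on Fuchsia, before any -stack-protector-guard-offset override. A small self-contained sketch of that offset selection (the helper is hypothetical, not part of this file):

#include <cstdio>

// Hypothetical mirror of the default guard-slot selection above. Returns the
// byte offset of the stack guard within the thread segment.
static unsigned defaultStackGuardOffset(bool IsFuchsia, bool Is64Bit,
                                        int RequestedOffset /* -1 = unset */) {
  if (IsFuchsia)
    return 0x10; // ZX_TLS_STACK_GUARD_OFFSET
  if (RequestedOffset != -1)
    return static_cast<unsigned>(RequestedOffset); // -stack-protector-guard-offset
  return Is64Bit ? 0x28 : 0x14; // %fs:0x28 on x86-64, %gs:0x14 on i386
}

int main() {
  std::printf("64-bit default guard slot offset: 0x%x\n",
              defaultStackGuardOffset(false, true, -1)); // 0x28
  return 0;
}
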
2507
2508void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2509 // MSVC CRT provides functionalities for stack protection.
2510 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2511 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2512 // MSVC CRT has a global variable holding security cookie.
2513 M.getOrInsertGlobal("__security_cookie",
2514 Type::getInt8PtrTy(M.getContext()));
2515
2516 // MSVC CRT has a function to validate security cookie.
2517 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2518 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2519 Type::getInt8PtrTy(M.getContext()));
2520 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2521 F->setCallingConv(CallingConv::X86_FastCall);
2522 F->addAttribute(1, Attribute::AttrKind::InReg);
2523 }
2524 return;
2525 }
2526
2527 auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
2528
2529 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2530 if ((GuardMode == llvm::StackProtectorGuards::TLS ||
2531 GuardMode == llvm::StackProtectorGuards::None)
2532 && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2533 return;
2534 TargetLowering::insertSSPDeclarations(M);
2535}
2536
2537Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2538 // MSVC CRT has a global variable holding security cookie.
2539 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2540 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2541 return M.getGlobalVariable("__security_cookie");
2542 }
2543 return TargetLowering::getSDagStackGuard(M);
2544}
2545
2546Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2547 // MSVC CRT has a function to validate security cookie.
2548 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2549 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2550 return M.getFunction("__security_check_cookie");
2551 }
2552 return TargetLowering::getSSPStackGuardCheck(M);
2553}
2554
2555Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2556 if (Subtarget.getTargetTriple().isOSContiki())
2557 return getDefaultSafeStackPointerLocation(IRB, false);
2558
2559 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2560 // definition of TLS_SLOT_SAFESTACK in
2561 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2562 if (Subtarget.isTargetAndroid()) {
2563 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2564 // %gs:0x48; %gs:0x24 on i386.
2565 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2566 return SegmentOffset(IRB, Offset, getAddressSpace());
2567 }
2568
2569 // Fuchsia is similar.
2570 if (Subtarget.isTargetFuchsia()) {
2571 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2572 return SegmentOffset(IRB, 0x18, getAddressSpace());
2573 }
2574
2575 return TargetLowering::getSafeStackPointerLocation(IRB);
2576}
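
The fixed safe-stack TLS offsets used above are 0x48 (x86-64 Android), 0x24 (i386 Android), and 0x18 (Fuchsia), all reached through the same segment address space as the stack guard. A hedged standalone sketch of that table, purely illustrative:

#include <cstdio>

enum class SafeStackOS { AndroidX86_64, AndroidI386, Fuchsia };

// Illustrative mapping of the hard-coded unsafe-stack-pointer TLS offsets
// chosen above; the real lowering goes through SegmentOffset() instead.
static unsigned safeStackTLSOffset(SafeStackOS OS) {
  switch (OS) {
  case SafeStackOS::AndroidX86_64: return 0x48; // TLS_SLOT_SAFESTACK, 64-bit
  case SafeStackOS::AndroidI386:   return 0x24; // TLS_SLOT_SAFESTACK, 32-bit
  case SafeStackOS::Fuchsia:       return 0x18; // ZX_TLS_UNSAFE_SP_OFFSET
  }
  return 0; // unreachable for the enumerators above
}

int main() {
  std::printf("Fuchsia unsafe SP offset: 0x%x\n",
              safeStackTLSOffset(SafeStackOS::Fuchsia)); // 0x18
  return 0;
}
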
2577
2578//===----------------------------------------------------------------------===//
2579// Return Value Calling Convention Implementation
2580//===----------------------------------------------------------------------===//
2581
2582bool X86TargetLowering::CanLowerReturn(
2583 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2584 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2585 SmallVector<CCValAssign, 16> RVLocs;
2586 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2587 return CCInfo.CheckReturn(Outs, RetCC_X86);
2588}
2589
2590const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2591 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2592 return ScratchRegs;
2593}
2594
2595/// Lowers mask values (v*i1) to the corresponding local register values.
2596/// \returns the DAG node after lowering to the register type.
2597static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2598 const SDLoc &Dl, SelectionDAG &DAG) {
2599 EVT ValVT = ValArg.getValueType();
2600
2601 if (ValVT == MVT::v1i1)
2602 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2603 DAG.getIntPtrConstant(0, Dl));
2604
2605 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2606 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2607 // Two stage lowering might be required
2608 // bitcast: v8i1 -> i8 / v16i1 -> i16
2609 // anyextend: i8 -> i32 / i16 -> i32
2610 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2611 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2612 if (ValLoc == MVT::i32)
2613 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2614 return ValToCopy;
2615 }
2616
2617 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2618 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2619 // One stage lowering is required
2620 // bitcast: v32i1 -> i32 / v64i1 -> i64
2621 return DAG.getBitcast(ValLoc, ValArg);
2622 }
2623
2624 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2625}
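
At the bit level, the two-stage path above (bitcast v8i1 to i8, then any-extend to i32) amounts to "pack eight bits, then widen". A minimal standalone illustration of that packing, independent of the DAG machinery; names here are invented for the sketch:

#include <array>
#include <cstdint>
#include <cstdio>

// Packs eight booleans into an i8-style mask (the bitcast step), then widens
// it to 32 bits (the any-extend step); upper bits are don't-care, kept as 0.
static uint32_t packV8i1ToI32(const std::array<bool, 8> &Mask) {
  uint8_t Bits = 0;
  for (unsigned I = 0; I != 8; ++I)
    Bits |= static_cast<uint8_t>(Mask[I]) << I; // element 0 -> bit 0
  return Bits; // implicit zero-extension stands in for ANY_EXTEND here
}

int main() {
  std::array<bool, 8> M = {true, false, true, true, false, false, false, true};
  std::printf("packed mask = 0x%02x\n",
              static_cast<unsigned>(packV8i1ToI32(M))); // 0x8d
  return 0;
}
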
2626
2627/// Breaks v64i1 value into two registers and adds the new node to the DAG
2628static void Passv64i1ArgInRegs(
2629 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2630 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2631 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2632 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2633 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2634 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2635 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2636        "The value should reside in two registers");
2637
2638 // Before splitting the value we cast it to i64
2639 Arg = DAG.getBitcast(MVT::i64, Arg);
2640
2641 // Splitting the value into two i32 types
2642 SDValue Lo, Hi;
2643 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2644 DAG.getConstant(0, Dl, MVT::i32));
2645 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2646 DAG.getConstant(1, Dl, MVT::i32));
2647
2648 // Attach the two i32 types into corresponding registers
2649 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2650 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2651}
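
The split performed above is the usual lo/hi decomposition of a 64-bit value into two 32-bit register halves. A self-contained sketch of the same arithmetic, with no SelectionDAG involved:

#include <cstdint>
#include <cstdio>

// Splits a 64-bit mask into the two 32-bit halves that would be placed in the
// VA / NextVA registers: Lo carries bits 0-31, Hi carries bits 32-63.
static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);       // EXTRACT_ELEMENT 0
  Hi = static_cast<uint32_t>(Mask >> 32); // EXTRACT_ELEMENT 1
}

int main() {
  uint32_t Lo = 0, Hi = 0;
  splitMask64(0x0123456789abcdefULL, Lo, Hi);
  std::printf("Lo = 0x%08x, Hi = 0x%08x\n", static_cast<unsigned>(Lo),
              static_cast<unsigned>(Hi)); // Lo = 0x89abcdef, Hi = 0x01234567
  return 0;
}
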
2652
2653SDValue
2654X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2655 bool isVarArg,
2656 const SmallVectorImpl<ISD::OutputArg> &Outs,
2657 const SmallVectorImpl<SDValue> &OutVals,
2658 const SDLoc &dl, SelectionDAG &DAG) const {
2659 MachineFunction &MF = DAG.getMachineFunction();
2660 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2661
2662 // In some cases we need to disable registers from the default CSR list.
2663 // For example, when they are used for argument passing.
2664 bool ShouldDisableCalleeSavedRegister =
2665 CallConv == CallingConv::X86_RegCall ||
2666 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2667
2668 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2669 report_fatal_error("X86 interrupts may not return any value");
2670
2671 SmallVector<CCValAssign, 16> RVLocs;
2672 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2673 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2674
2675 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2676 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2677 ++I, ++OutsIndex) {
2678 CCValAssign &VA = RVLocs[I];
2679 assert(VA.isRegLoc() && "Can only return in registers!");
2680
2681 // Add the register to the CalleeSaveDisableRegs list.
2682 if (ShouldDisableCalleeSavedRegister)
2683 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2684
2685 SDValue ValToCopy = OutVals[OutsIndex];
2686 EVT ValVT = ValToCopy.getValueType();
2687
2688 // Promote values to the appropriate types.
2689 if (VA.getLocInfo() == CCValAssign::SExt)
2690 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2691 else if (VA.getLocInfo() == CCValAssign::ZExt)
2692 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2693 else if (VA.getLocInfo() == CCValAssign::AExt) {
2694 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2695 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2696 else
2697 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2698 }
2699 else if (VA.getLocInfo() == CCValAssign::BCvt)
2700 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2701
2702 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2703        "Unexpected FP-extend for return value.");
2704
2705 // Report an error if we have attempted to return a value via an XMM
2706 // register and SSE was disabled.
2707 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2708 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2709 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2710 } else if (!Subtarget.hasSSE2() &&
2711 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2712 ValVT == MVT::f64) {
2713 // When returning a double via an XMM register, report an error if SSE2 is
2714 // not enabled.
2715 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2716 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2717 }
2718
2719 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2720 // the RET instruction and handled by the FP Stackifier.
2721 if (VA.getLocReg() == X86::FP0 ||
2722 VA.getLocReg() == X86::FP1) {
2723 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2724 // change the value to the FP stack register class.
2725 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2726 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2727 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2728 // Don't emit a copytoreg.
2729 continue;
2730 }
2731
2732 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2733 // which is returned in RAX / RDX.
2734 if (Subtarget.is64Bit()) {
2735 if (ValVT == MVT::x86mmx) {
2736 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2737 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2738 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2739 ValToCopy);
2740 // If we don't have SSE2 available, convert to v4f32 so the generated
2741 // register is legal.
2742 if (!Subtarget.hasSSE2())
2743 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2744 }
2745 }
2746 }
2747
2748 if (VA.needsCustom()) {
2749 assert(VA.getValVT() == MVT::v64i1 &&
2750        "Currently the only custom case is when we split v64i1 to 2 regs");
2751
2752 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2753 Subtarget);
2754
2755 // Add the second register to the CalleeSaveDisableRegs list.
2756 if (ShouldDisableCalleeSavedRegister)
2757 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2758 } else {
2759 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2760 }
2761 }
2762
2763 SDValue Flag;
2764 SmallVector<SDValue, 6> RetOps;
2765 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2766 // Operand #1 = Bytes To Pop
2767 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2768 MVT::i32));
2769
2770 // Copy the result values into the output registers.
2771 for (auto &RetVal : RetVals) {
2772 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2773 RetOps.push_back(RetVal.second);
2774 continue; // Don't emit a copytoreg.
2775 }
2776
2777 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2778 Flag = Chain.getValue(1);
2779 RetOps.push_back(
2780 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2781 }
2782
2783 // Swift calling convention does not require we copy the sret argument
2784 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2785
2786 // All x86 ABIs require that for returning structs by value we copy
2787 // the sret argument into %rax/%eax (depending on ABI) for the return.
2788 // We saved the argument into a virtual register in the entry block,
2789 // so now we copy the value out and into %rax/%eax.
2790 //
2791 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2792 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2793 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2794 // either case FuncInfo->setSRetReturnReg() will have been called.
2795 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2796 // When we have both sret and another return value, we should use the
2797 // original Chain stored in RetOps[0], instead of the current Chain updated
2798 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2799
2800 // For the case of sret and another return value, we have
2801 // Chain_0 at the function entry
2802 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2803 // If we use Chain_1 in getCopyFromReg, we will have
2804 // Val = getCopyFromReg(Chain_1)
2805 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2806
2807 // getCopyToReg(Chain_0) will be glued together with
2808 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2809 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2810 // Data dependency from Unit B to Unit A due to usage of Val in
2811 // getCopyToReg(Chain_1, Val)
2812 // Chain dependency from Unit A to Unit B
2813
2814 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2815 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2816 getPointerTy(MF.getDataLayout()));
2817
2818 Register RetValReg
2819 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2820 X86::RAX : X86::EAX;
2821 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2822 Flag = Chain.getValue(1);
2823
2824 // RAX/EAX now acts like a return value.
2825 RetOps.push_back(
2826 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2827
2828 // Add the returned register to the CalleeSaveDisableRegs list.
2829 if (ShouldDisableCalleeSavedRegister)
2830 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2831 }
2832
2833 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2834 const MCPhysReg *I =
2835 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2836 if (I) {
2837 for (; *I; ++I) {
2838 if (X86::GR64RegClass.contains(*I))
2839 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2840 else
2841 llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2841)
;
2842 }
2843 }
2844
2845 RetOps[0] = Chain; // Update chain.
2846
2847 // Add the flag if we have it.
2848 if (Flag.getNode())
2849 RetOps.push_back(Flag);
2850
2851 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2852 if (CallConv == CallingConv::X86_INTR)
2853 opcode = X86ISD::IRET;
2854 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2855}
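
In rough terms, the return node assembled above carries the chain, the bytes-to-pop immediate, one register operand per returned value (plus RAX/EAX when an sret pointer is returned), and finally the glue if any register copies were emitted. A hedged, purely descriptive sketch of that operand order for a function returning a 32-bit int with no callee-popped bytes:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Illustrative only; the real operands are SDValues, not strings.
  std::vector<std::string> RetOps = {
      "Chain",                   // operand #0, updated after the reg copies
      "TargetConstant<0> (pop)", // operand #1, bytes to pop on return
      "Register:i32 $eax",       // one operand per value-returning register
      "Glue",                    // present because a CopyToReg was emitted
  };
  for (unsigned I = 0; I != RetOps.size(); ++I)
    std::printf("RetOps[%u] = %s\n", I, RetOps[I].c_str());
  return 0;
}
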
2856
2857bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2858 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2859 return false;
2860
2861 SDValue TCChain = Chain;
2862 SDNode *Copy = *N->use_begin();
2863 if (Copy->getOpcode() == ISD::CopyToReg) {
2864 // If the copy has a glue operand, we conservatively assume it isn't safe to
2865 // perform a tail call.
2866 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2867 return false;
2868 TCChain = Copy->getOperand(0);
2869 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2870 return false;
2871
2872 bool HasRet = false;
2873 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2874 UI != UE; ++UI) {
2875 if (UI->getOpcode() != X86ISD::RET_FLAG)
2876 return false;
2877 // If we are returning more than one value, we can definitely
2878 // not make a tail call; see PR19530.
2879 if (UI->getNumOperands() > 4)
2880 return false;
2881 if (UI->getNumOperands() == 4 &&
2882 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2883 return false;
2884 HasRet = true;
2885 }
2886
2887 if (!HasRet)
2888 return false;
2889
2890 Chain = TCChain;
2891 return true;
2892}
2893
2894EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2895 ISD::NodeType ExtendKind) const {
2896 MVT ReturnMVT = MVT::i32;
2897
2898 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2899 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2900 // The ABI does not require i1, i8 or i16 to be extended.
2901 //
2902 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2903 // always extending i8/i16 return values, so keep doing that for now.
2904 // (PR26665).
2905 ReturnMVT = MVT::i8;
2906 }
2907
2908 EVT MinVT = getRegisterType(Context, ReturnMVT);
2909 return VT.bitsLT(MinVT) ? MinVT : VT;
2910}
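
As a rough sketch of the promotion rule above: i1/i8/i16 return values may stay as narrow as i8, except that on Darwin i8/i16 keep being widened to i32 for compatibility (PR26665). The standalone illustration below shows the resulting width under the simplifying assumption that the register type for i8 and i32 is the type itself; the helper name is invented:

#include <algorithm>
#include <cstdio>

// Bit width the extended return value ends up with, mirroring the logic above
// assuming getRegisterType(i8) == i8 and getRegisterType(i32) == i32.
static unsigned extReturnBits(unsigned ValBits, bool IsDarwin) {
  bool KeepNarrow = ValBits == 1 || (!IsDarwin && (ValBits == 8 || ValBits == 16));
  unsigned MinBits = KeepNarrow ? 8 : 32;
  return std::max(ValBits, MinBits); // VT.bitsLT(MinVT) ? MinVT : VT
}

int main() {
  std::printf("i8  on Linux : i%u\n", extReturnBits(8, false)); // i8
  std::printf("i16 on Darwin: i%u\n", extReturnBits(16, true)); // i32
  return 0;
}
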
2911
2912/// Reads two 32 bit registers and creates a 64 bit mask value.
2913/// \param VA The current 32 bit value that needs to be assigned.
2914/// \param NextVA The next 32 bit value that needs to be assigned.
2915/// \param Root The parent DAG node.
2916/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2917///                        for glue purposes. If the DAG already uses a
2918///                        physical register instead of a virtual one, the new
2919///                        SDValue should be glued to the InFlag SDValue.
2920/// \return a new 64 bit SDValue.
2921static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2922 SDValue &Root, SelectionDAG &DAG,
2923 const SDLoc &Dl, const X86Subtarget &Subtarget,
2924 SDValue *InFlag = nullptr) {
2925 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2926 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2927 assert(VA.getValVT() == MVT::v64i1 &&
2928        "Expecting first location of 64 bit width type");
2929 assert(NextVA.getValVT() == VA.getValVT() &&
2930        "The locations should have the same type");
2931 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2932        "The values should reside in two registers");
2933
2934 SDValue Lo, Hi;
2935 SDValue ArgValueLo, ArgValueHi;
2936
2937 MachineFunction &MF = DAG.getMachineFunction();
2938 const TargetRegisterClass *RC = &X86::GR32RegClass;
2939
2940 // Read a 32 bit value from the registers.
2941 if (nullptr == InFlag) {
2942 // When no physical register is present,
2943 // create an intermediate virtual register.
2944 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2945 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2946 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2947 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2948 } else {
2949 // When a physical register is available, read the value from it and glue
2950 // the reads together.
2951 ArgValueLo =
2952 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2953 *InFlag = ArgValueLo.getValue(2);
2954 ArgValueHi =
2955 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2956 *InFlag = ArgValueHi.getValue(2);
2957 }
2958
2959 // Convert the i32 type into v32i1 type.
2960 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2961
2962 // Convert the i32 type into v32i1 type.
2963 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2964
2965 // Concatenate the two values together.
2966 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2967}
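
This is the inverse of Passv64i1ArgInRegs: two 32-bit register halves are turned back into a 64-bit mask (two v32i1 values concatenated into v64i1). Bit-wise that is just lo | (hi << 32); a minimal standalone sketch:

#include <cstdint>
#include <cstdio>

// Rebuilds the 64-bit mask from the Lo/Hi register halves read above; the
// CONCAT_VECTORS of two v32i1 values corresponds to placing Hi in bits 32-63.
static uint64_t joinMask64(uint32_t Lo, uint32_t Hi) {
  return static_cast<uint64_t>(Lo) | (static_cast<uint64_t>(Hi) << 32);
}

int main() {
  std::printf("mask = 0x%016llx\n",
              static_cast<unsigned long long>(joinMask64(0x89abcdefu, 0x01234567u)));
  return 0;
}
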
2968
2969/// The function will lower a register of various sizes (8/16/32/64)
2970/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2971/// \returns a DAG node containing the operand after lowering to a mask type.
2972static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2973 const EVT &ValLoc, const SDLoc &Dl,
2974 SelectionDAG &DAG) {
2975 SDValue ValReturned = ValArg;
2976
2977 if (ValVT == MVT::v1i1)
2978 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2979
2980 if (ValVT == MVT::v64i1) {
2981 // On a 32 bit machine this case is handled by getv64i1Argument.
2982 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2983 // On a 64 bit machine there is no need to truncate the value, only bitcast it.
2984 } else {
2985 MVT maskLen;
2986 switch (ValVT.getSimpleVT().SimpleTy) {
2987 case MVT::v8i1:
2988 maskLen = MVT::i8;
2989 break;
2990 case MVT::v16i1:
2991 maskLen = MVT::i16;
2992 break;
2993 case MVT::v32i1:
2994 maskLen = MVT::i32;
2995 break;
2996 default:
2997 llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 2997)
;
2998 }
2999
3000 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3001 }
3002 return DAG.getBitcast(ValVT, ValReturned);
3003}
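
Put simply, the truncation above narrows the incoming register to exactly the number of mask bits before the final bitcast: v8i1 takes i8, v16i1 takes i16, v32i1 takes i32, and v64i1 is bitcast directly from i64. A small illustrative sketch of that width table (not DAG code, names invented):

#include <cstdio>

// Number of scalar bits the register is truncated to before being bitcast to
// the v<N>i1 mask type; 0 stands in for "unsupported element count".
static unsigned maskTruncWidth(unsigned NumMaskElts) {
  switch (NumMaskElts) {
  case 8:  return 8;  // v8i1  <- i8
  case 16: return 16; // v16i1 <- i16
  case 32: return 32; // v32i1 <- i32
  case 64: return 64; // v64i1 <- i64 (bitcast only, no truncate)
  default: return 0;
  }
}

int main() {
  for (unsigned N : {8u, 16u, 32u, 64u})
    std::printf("v%ui1 uses a %u-bit scalar\n", N, maskTruncWidth(N));
  return 0;
}
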
3004
3005/// Lower the result values of a call into the
3006/// appropriate copies out of the corresponding physical registers.
3007///
3008SDValue X86TargetLowering::LowerCallResult(
3009 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3010 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3011 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3012 uint32_t *RegMask) const {
3013
3014 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3015 // Assign locations to each value returned by this call.
3016 SmallVector<CCValAssign, 16> RVLocs;
3017 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3018 *DAG.getContext());
3019 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3020
3021 // Copy all of the result registers out of their specified physreg.
3022 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3023 ++I, ++InsIndex) {
3024 CCValAssign &VA = RVLocs[I];
3025 EVT CopyVT = VA.getLocVT();
3026
3027 // In some calling conventions we need to remove the used registers
3028 // from the register mask.
3029 if (RegMask) {
3030 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3031 SubRegs.isValid(); ++SubRegs)
3032 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3033 }
3034
3035 // Report an error if there was an attempt to return FP values via XMM
3036 // registers.
3037 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3038 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3039 if (VA.getLocReg() == X86::XMM1)
3040 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3041 else
3042 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3043 } else if (!Subtarget.hasSSE2() &&
3044 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3045 CopyVT == MVT::f64) {
3046 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3047 if (VA.getLocReg() == X86::XMM1)
3048 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3049 else
3050 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3051 }
3052
3053 // If we prefer to use the value in xmm registers, copy it out as f80 and
3054 // use a truncate to move it from fp stack reg to xmm reg.
3055 bool RoundAfterCopy = false;
3056 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3057 isScalarFPTypeInSSEReg(VA.getValVT())) {
3058 if (!Subtarget.hasX87())
3059 report_fatal_error("X87 register return with X87 disabled");
3060 CopyVT = MVT::f80;
3061 RoundAfterCopy = (CopyVT != VA.getLocVT());
3062 }
3063
3064 SDValue Val;
3065 if (VA.needsCustom()) {
3066 assert(VA.getValVT() == MVT::v64i1 &&
3067        "Currently the only custom case is when we split v64i1 to 2 regs");
3068 Val =
3069 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3070 } else {
3071 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3072 .getValue(1);
3073 Val = Chain.getValue(0);
3074 InFlag = Chain.getValue(2);
3075 }
3076
3077 if (RoundAfterCopy)
3078 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3079 // This truncation won't change the value.
3080 DAG.getIntPtrConstant(1, dl));
3081
3082 if (VA.isExtInLoc()) {
3083 if (VA.getValVT().isVector() &&
3084 VA.getValVT().getScalarType() == MVT::i1 &&
3085 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3086 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3087 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3088 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3089 } else
3090 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3091 }
3092
3093 if (VA.getLocInfo() == CCValAssign::BCvt)
3094 Val = DAG.getBitcast(VA.getValVT(), Val);
3095
3096 InVals.push_back(Val);
3097 }
3098
3099 return Chain;
3100}
3101
3102//===----------------------------------------------------------------------===//
3103// C & StdCall & Fast Calling Convention implementation
3104//===----------------------------------------------------------------------===//
3105// The StdCall calling convention is the standard for many Windows API
3106// routines. It differs from the C calling convention only slightly: the
3107// callee cleans up the stack rather than the caller, and symbols are
3108// decorated (e.g. with an @<bytes> suffix). It doesn't support any vector
3109// arguments. For info on the fast calling convention see the Fast Calling
3110// Convention (tail call) implementation in LowerX86_32FastCCCallTo.
3111
3112/// CallIsStructReturn - Determines whether a call uses struct return
3113/// semantics.
3114enum StructReturnType {
3115 NotStructReturn,
3116 RegStructReturn,
3117 StackStructReturn
3118};
3119static StructReturnType
3120callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3121 if (Outs.empty())
3122 return NotStructReturn;
3123
3124 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3125 if (!Flags.isSRet())
3126 return NotStructReturn;
3127 if (Flags.isInReg() || IsMCU)
3128 return RegStructReturn;
3129 return StackStructReturn;
3130}
3131
3132/// Determines whether a function uses struct return semantics.
3133static StructReturnType
3134argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3135 if (Ins.empty())
3136 return NotStructReturn;
3137
3138 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3139 if (!Flags.isSRet())
3140 return NotStructReturn;
3141 if (Flags.isInReg() || IsMCU)
3142 return RegStructReturn;
3143 return StackStructReturn;
3144}
3145
3146/// Make a copy of an aggregate at address specified by "Src" to address
3147/// "Dst" with size and alignment information specified by the specific
3148/// parameter attribute. The copy will be passed as a byval function parameter.
3149static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3150 SDValue Chain, ISD::ArgFlagsTy Flags,
3151 SelectionDAG &DAG, const SDLoc &dl) {
3152 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3153
3154 return DAG.getMemcpy(
3155 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3156 /*isVolatile*/ false, /*AlwaysInline=*/true,
3157 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3158}
3159
3160/// Return true if the calling convention is one that we can guarantee TCO for.
3161static bool canGuaranteeTCO(CallingConv::ID CC) {
3162 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3163 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3164 CC == CallingConv::HHVM || CC == CallingConv::Tail);
3165}
3166
3167/// Return true if we might ever do TCO for calls with this calling convention.
3168static bool mayTailCallThisCC(CallingConv::ID CC) {
3169 switch (CC) {
3170 // C calling conventions:
3171 case CallingConv::C:
3172 case CallingConv::Win64:
3173 case CallingConv::X86_64_SysV:
3174 // Callee pop conventions:
3175 case CallingConv::X86_ThisCall:
3176 case CallingConv::X86_StdCall:
3177 case CallingConv::X86_VectorCall:
3178 case CallingConv::X86_FastCall:
3179 // Swift:
3180 case CallingConv::Swift:
3181 return true;
3182 default:
3183 return canGuaranteeTCO(CC);
3184 }
3185}
3186
3187/// Return true if the function is being made into a tailcall target by
3188/// changing its ABI.
3189static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3190 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
3191}
3192
3193bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3194 if (!CI->isTailCall())
3195 return false;
3196
3197 CallingConv::ID CalleeCC = CI->getCallingConv();
3198 if (!mayTailCallThisCC(CalleeCC))
3199 return false;
3200
3201 return true;
3202}
3203
3204SDValue
3205X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3206 const SmallVectorImpl<ISD::InputArg> &Ins,
3207 const SDLoc &dl, SelectionDAG &DAG,
3208 const CCValAssign &VA,
3209 MachineFrameInfo &MFI, unsigned i) const {
3210 // Create the nodes corresponding to a load from this parameter slot.
3211 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3212 bool AlwaysUseMutable = shouldGuaranteeTCO(
3213 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3214 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3215 EVT ValVT;
3216 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3217
3218 // If value is passed by pointer we have address passed instead of the value
3219 // itself. No need to extend if the mask value and location share the same
3220 // absolute size.
3221 bool ExtendedInMem =
3222 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3223 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3224
3225 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3226 ValVT = VA.getLocVT();
3227 else
3228 ValVT = VA.getValVT();
3229
3230 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3231 // changed with more analysis.
3232 // In case of tail call optimization mark all arguments mutable, since they
3233 // could be overwritten by the lowering of arguments in case of a tail call.
3234 if (Flags.isByVal()) {
3235 unsigned Bytes = Flags.getByValSize();
3236 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3237
3238 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3239 // can be improved with deeper analysis.
3240 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3241 /*isAliased=*/true);
3242 return DAG.getFrameIndex(FI, PtrVT);
3243 }
3244
3245 EVT ArgVT = Ins[i].ArgVT;
3246
3247 // If this is a vector that has been split into multiple parts, and the
3248 // scalar size of the parts doesn't match the vector element size, then we can't
3249 // elide the copy. The parts will have padding between them instead of being
3250 // packed like a vector.
3251 bool ScalarizedAndExtendedVector =
3252 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3253 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3254
3255 // This is an argument in memory. We might be able to perform copy elision.
3256 // If the argument is passed directly in memory without any extension, then we
3257 // can perform copy elision. Large vector types, for example, may be passed
3258 // indirectly by pointer.
3259 if (Flags.isCopyElisionCandidate() &&
3260 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3261 !ScalarizedAndExtendedVector) {
3262 SDValue PartAddr;
3263 if (Ins[i].PartOffset == 0) {
3264 // If this is a one-part value or the first part of a multi-part value,
3265 // create a stack object for the entire argument value type and return a
3266 // load from our portion of it. This assumes that if the first part of an
3267 // argument is in memory, the rest will also be in memory.
3268 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3269 /*IsImmutable=*/false);
3270 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3271 return DAG.getLoad(
3272 ValVT, dl, Chain, PartAddr,
3273 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3274 } else {
3275 // This is not the first piece of an argument in memory. See if there is
3276 // already a fixed stack object including this offset. If so, assume it
3277 // was created by the PartOffset == 0 branch above and create a load from
3278 // the appropriate offset into it.
3279 int64_t PartBegin = VA.getLocMemOffset();
3280 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3281 int FI = MFI.getObjectIndexBegin();
3282 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3283 int64_t ObjBegin = MFI.getObjectOffset(FI);
3284 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3285 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3286 break;
3287 }
3288 if (MFI.isFixedObjectIndex(FI)) {
3289 SDValue Addr =
3290 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3291 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3292 return DAG.getLoad(
3293 ValVT, dl, Chain, Addr,
3294 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3295 Ins[i].PartOffset));
3296 }
3297 }
3298 }
3299
3300 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3301 VA.getLocMemOffset(), isImmutable);
3302
3303 // Set SExt or ZExt flag.
3304 if (VA.getLocInfo() == CCValAssign::ZExt) {
3305 MFI.setObjectZExt(FI, true);
3306 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3307 MFI.setObjectSExt(FI, true);
3308 }
3309
3310 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3311 SDValue Val = DAG.getLoad(
3312 ValVT, dl, Chain, FIN,
3313 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3314 return ExtendedInMem
3315 ? (VA.getValVT().isVector()
3316 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3317 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3318 : Val;
3319}
3320
3321// FIXME: Get this from tablegen.
3322static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3323 const X86Subtarget &Subtarget) {
3324 assert(Subtarget.is64Bit());
3325
3326 if (Subtarget.isCallingConvWin64(CallConv)) {
3327 static const MCPhysReg GPR64ArgRegsWin64[] = {
3328 X86::RCX, X86::RDX, X86::R8, X86::R9
3329 };
3330 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3331 }
3332
3333 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3334 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3335 };
3336 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3337}
3338
3339// FIXME: Get this from tablegen.
3340static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3341 CallingConv::ID CallConv,
3342 const X86Subtarget &Subtarget) {
3343 assert(Subtarget.is64Bit());
3344 if (Subtarget.isCallingConvWin64(CallConv)) {
3345 // The XMM registers which might contain var arg parameters are shadowed
3346 // by their paired GPRs, so we only need to save the GPRs to their home
3347 // slots.
3348 // TODO: __vectorcall will change this.
3349 return None;
3350 }
3351
3352 const Function &F = MF.getFunction();
3353 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3354 bool isSoftFloat = Subtarget.useSoftFloat();
3355 assert(!(isSoftFloat && NoImplicitFloatOps) &&
3356        "SSE register cannot be used when SSE is disabled!");
3357 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3358 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3359 // registers.
3360 return None;
3361
3362 static const MCPhysReg XMMArgRegs64Bit[] = {
3363 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3364 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3365 };
3366 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3367}
3368
3369#ifndef NDEBUG
3370static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3371 return llvm::is_sorted(
3372 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3373 return A.getValNo() < B.getValNo();
3374 });
3375}
3376#endif
3377
3378namespace {
3379/// This is a helper class for lowering variable argument (vararg) parameters.
3380class VarArgsLoweringHelper {
3381public:
3382 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3383 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3384 CallingConv::ID CallConv, CCState &CCInfo)
3385 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3386 TheMachineFunction(DAG.getMachineFunction()),
3387 TheFunction(TheMachineFunction.getFunction()),
3388 FrameInfo(TheMachineFunction.getFrameInfo()),
3389 FrameLowering(*Subtarget.getFrameLowering()),
3390 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3391 CCInfo(CCInfo) {}
3392
3393 // Lower variable argument (vararg) parameters.
3394 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3395
3396private:
3397 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3398
3399 void forwardMustTailParameters(SDValue &Chain);
3400
3401 bool is64Bit() const { return Subtarget.is64Bit(); }
3402 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3403
3404 X86MachineFunctionInfo *FuncInfo;
3405 const SDLoc &DL;
3406 SelectionDAG &DAG;
3407 const X86Subtarget &Subtarget;
3408 MachineFunction &TheMachineFunction;
3409 const Function &TheFunction;
3410 MachineFrameInfo &FrameInfo;
3411 const TargetFrameLowering &FrameLowering;
3412 const TargetLowering &TargLowering;
3413 CallingConv::ID CallConv;
3414 CCState &CCInfo;
3415};
3416} // namespace
3417
3418void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3419 SDValue &Chain, unsigned StackSize) {
3420 // If the function takes a variable number of arguments, make a frame index
3421 // for the start of the first vararg value, for expansion of llvm.va_start.
3422 // We can skip this if there are no va_start calls.
3423 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3424 CallConv != CallingConv::X86_ThisCall)) {
3425 FuncInfo->setVarArgsFrameIndex(
3426 FrameInfo.CreateFixedObject(1, StackSize, true));
3427 }
3428
3429 // Figure out if XMM registers are in use.
3430 assert(!(Subtarget.useSoftFloat() &&
3431          TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3432        "SSE register cannot be used when SSE is disabled!");
3433
3434 // 64-bit calling conventions support varargs and register parameters, so we
3435 // have to do extra work to spill them in the prologue.
3436 if (is64Bit()) {
3437 // Find the first unallocated argument registers.
3438 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3439 ArrayRef<MCPhysReg> ArgXMMs =
3440 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3441 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3442 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3443
3444 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3445        "SSE register cannot be used when SSE is disabled!");
3446
3447 if (isWin64()) {
3448 // Get to the caller-allocated home save location. Add 8 to account
3449 // for the return address.
3450 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3451 FuncInfo->setRegSaveFrameIndex(
3452 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3453 // Fixup to set vararg frame on shadow area (4 x i64).
3454 if (NumIntRegs < 4)
3455 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3456 } else {
3457 // For X86-64, if there are vararg parameters that are passed via
3458 // registers, then we must store them to their spots on the stack so
3459 // they may be loaded by dereferencing the result of va_next.
3460 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3461 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3462 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3463 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3464 }
3465
3466 SmallVector<SDValue, 6>
3467     LiveGPRs; // SDValues for the GPRs holding live incoming argument values
3468 SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for the XMM registers
3469                                      // holding live incoming argument values
3470 SDValue ALVal; // if applicable, the SDValue for the %al register
3471
3472 // Gather all the live in physical registers.
3473 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3474 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3475 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3476 }
3477 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3478 if (!AvailableXmms.empty()) {
3479 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3480 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3481 for (MCPhysReg Reg : AvailableXmms) {
3482 Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
3483 LiveXMMRegs.push_back(
3484 DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
3485 }
3486 }
3487
3488 // Store the integer parameter registers.
3489 SmallVector<SDValue, 8> MemOps;
3490 SDValue RSFIN =
3491 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3492 TargLowering.getPointerTy(DAG.getDataLayout()));
3493 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3494 for (SDValue Val : LiveGPRs) {
3495 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3496 TargLowering.getPointerTy(DAG.getDataLayout()),
3497 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3498 SDValue Store =
3499 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3500 MachinePointerInfo::getFixedStack(
3501 DAG.getMachineFunction(),
3502 FuncInfo->getRegSaveFrameIndex(), Offset));
3503 MemOps.push_back(Store);
3504 Offset += 8;
3505 }
3506
3507 // Now store the XMM (fp + vector) parameter registers.
3508 if (!LiveXMMRegs.empty()) {
3509 SmallVector<SDValue, 12> SaveXMMOps;
3510 SaveXMMOps.push_back(Chain);
3511 SaveXMMOps.push_back(ALVal);
3512 SaveXMMOps.push_back(
3513 DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3514 SaveXMMOps.push_back(
3515 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3516 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3517 LiveXMMRegs.end());
3518 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3519 MVT::Other, SaveXMMOps));
3520 }
3521
3522 if (!MemOps.empty())
3523 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3524 }
3525}
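
For the non-Win64 path above, the register save area created for varargs covers all six SysV integer argument registers plus all eight XMM argument registers when SSE is available, i.e. 6*8 + 8*16 = 176 bytes. A quick standalone check of that arithmetic, assuming the register lists shown earlier in this file:

#include <cstdio>

int main() {
  const unsigned NumArgGPRs = 6; // RDI, RSI, RDX, RCX, R8, R9
  const unsigned NumArgXMMs = 8; // XMM0-XMM7 (0 if SSE is disabled)
  const unsigned RegSaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;
  std::printf("SysV x86-64 vararg register save area: %u bytes\n",
              RegSaveAreaSize); // 176
  return 0;
}
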
3526
3527void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3528 // Find the largest legal vector type.
3529 MVT VecVT = MVT::Other;
3530 // FIXME: Only some x86_32 calling conventions support AVX512.
3531 if (Subtarget.useAVX512Regs() &&
3532 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3533 CallConv == CallingConv::Intel_OCL_BI)))
3534 VecVT = MVT::v16f32;
3535 else if (Subtarget.hasAVX())
3536 VecVT = MVT::v8f32;
3537 else if (Subtarget.hasSSE2())
3538 VecVT = MVT::v4f32;
3539
3540 // We forward some GPRs and some vector types.
3541 SmallVector<MVT, 2> RegParmTypes;
3542 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3543 RegParmTypes.push_back(IntVT);
3544 if (VecVT != MVT::Other)
3545 RegParmTypes.push_back(VecVT);
3546
3547 // Compute the set of forwarded registers. The rest are scratch.
3548 SmallVectorImpl<ForwardedRegister> &Forwards =
3549 FuncInfo->getForwardedMustTailRegParms();
3550 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3551
3552 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3553 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3554 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3555 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3556 }
3557
3558 // Copy all forwards from physical to virtual registers.
3559 for (ForwardedRegister &FR : Forwards) {
3560 // FIXME: Can we use a less constrained schedule?
3561 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3562 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3563 TargLowering.getRegClassFor(FR.VT));
3564 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3565 }
3566}
3567
3568void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3569 unsigned StackSize) {
3570 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3571 // If necessary, it will be set to the correct value later.
3572 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3573 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3574
3575 if (FrameInfo.hasVAStart())
3576 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3577
3578 if (FrameInfo.hasMustTailInVarArgFunc())
3579 forwardMustTailParameters(Chain);
3580}
3581
3582SDValue X86TargetLowering::LowerFormalArguments(
3583 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3584 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3585 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3586 MachineFunction &MF = DAG.getMachineFunction();
3587 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3588
3589 const Function &F = MF.getFunction();
3590 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3591 F.getName() == "main")
3592 FuncInfo->setForceFramePointer(true);
3593
3594 MachineFrameInfo &MFI = MF.getFrameInfo();
3595 bool Is64Bit = Subtarget.is64Bit();
3596 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3597
3598 assert(
3599 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3600 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3601
3602 // Assign locations to all of the incoming arguments.
3603 SmallVector<CCValAssign, 16> ArgLocs;
3604 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3605
3606 // Allocate shadow area for Win64.
3607 if (IsWin64)
3608 CCInfo.AllocateStack(32, Align(8));
3609
3610 CCInfo.AnalyzeArguments(Ins, CC_X86);
3611
3612 // In vectorcall calling convention a second pass is required for the HVA
3613 // types.
3614 if (CallingConv::X86_VectorCall == CallConv) {
3615 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3616 }
3617
3618 // The next loop assumes that the locations are in the same order as the
3619 // input arguments.
3620 assert(isSortedByValueNo(ArgLocs) &&
3621 "Argument Location list must be sorted before lowering");
3622
3623 SDValue ArgValue;
3624 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3625 ++I, ++InsIndex) {
3626 assert(InsIndex < Ins.size() && "Invalid Ins index");
3627 CCValAssign &VA = ArgLocs[I];
3628
3629 if (VA.isRegLoc()) {
3630 EVT RegVT = VA.getLocVT();
3631 if (VA.needsCustom()) {
3632 assert(
3633 VA.getValVT() == MVT::v64i1 &&
3634 "Currently the only custom case is when we split v64i1 to 2 regs");
3635
3636 // v64i1 values, in regcall calling convention, that are
3637 // compiled to 32 bit arch, are split up into two registers.
3638 ArgValue =
3639 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3640 } else {
3641 const TargetRegisterClass *RC;
3642 if (RegVT == MVT::i8)
3643 RC = &X86::GR8RegClass;
3644 else if (RegVT == MVT::i16)
3645 RC = &X86::GR16RegClass;
3646 else if (RegVT == MVT::i32)
3647 RC = &X86::GR32RegClass;
3648 else if (Is64Bit && RegVT == MVT::i64)
3649 RC = &X86::GR64RegClass;
3650 else if (RegVT == MVT::f32)
3651 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3652 else if (RegVT == MVT::f64)
3653 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3654 else if (RegVT == MVT::f80)
3655 RC = &X86::RFP80RegClass;
3656 else if (RegVT == MVT::f128)
3657 RC = &X86::VR128RegClass;
3658 else if (RegVT.is512BitVector())
3659 RC = &X86::VR512RegClass;
3660 else if (RegVT.is256BitVector())
3661 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3662 else if (RegVT.is128BitVector())
3663 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3664 else if (RegVT == MVT::x86mmx)
3665 RC = &X86::VR64RegClass;
3666 else if (RegVT == MVT::v1i1)
3667 RC = &X86::VK1RegClass;
3668 else if (RegVT == MVT::v8i1)
3669 RC = &X86::VK8RegClass;
3670 else if (RegVT == MVT::v16i1)
3671 RC = &X86::VK16RegClass;
3672 else if (RegVT == MVT::v32i1)
3673 RC = &X86::VK32RegClass;
3674 else if (RegVT == MVT::v64i1)
3675 RC = &X86::VK64RegClass;
3676 else
3677 llvm_unreachable("Unknown argument type!");
3678
3679 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3680 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3681 }
3682
3683 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3684 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3685 // right size.
3686 if (VA.getLocInfo() == CCValAssign::SExt)
3687 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3688 DAG.getValueType(VA.getValVT()));
3689 else if (VA.getLocInfo() == CCValAssign::ZExt)
3690 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3691 DAG.getValueType(VA.getValVT()));
3692 else if (VA.getLocInfo() == CCValAssign::BCvt)
3693 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3694
3695 if (VA.isExtInLoc()) {
3696 // Handle MMX values passed in XMM regs.
3697 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3698 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3699 else if (VA.getValVT().isVector() &&
3700 VA.getValVT().getScalarType() == MVT::i1 &&
3701 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3702 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3703 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3704 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3705 } else
3706 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3707 }
3708 } else {
3709 assert(VA.isMemLoc());
3710 ArgValue =
3711 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3712 }
3713
3714 // If value is passed via pointer - do a load.
3715 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3716 ArgValue =
3717 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3718
3719 InVals.push_back(ArgValue);
3720 }
3721
3722 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3723 // Swift calling convention does not require we copy the sret argument
3724 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3725 if (CallConv == CallingConv::Swift)
3726 continue;
3727
3728 // All x86 ABIs require that for returning structs by value we copy the
3729 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3730 // the argument into a virtual register so that we can access it from the
3731 // return points.
3732 if (Ins[I].Flags.isSRet()) {
3733 Register Reg = FuncInfo->getSRetReturnReg();
3734 if (!Reg) {
3735 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3736 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3737 FuncInfo->setSRetReturnReg(Reg);
3738 }
3739 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3740 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3741 break;
3742 }
3743 }
3744
3745 unsigned StackSize = CCInfo.getNextStackOffset();
3746 // Align stack specially for tail calls.
3747 if (shouldGuaranteeTCO(CallConv,
3748 MF.getTarget().Options.GuaranteedTailCallOpt))
3749 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3750
3751 if (IsVarArg)
3752 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3753 .lowerVarArgsParameters(Chain, StackSize);
3754
3755 // Some CCs need callee pop.
3756 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3757 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3758 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3759 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3760 // X86 interrupts must pop the error code (and the alignment padding) if
3761 // present.
3762 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3763 } else {
3764 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3765 // If this is an sret function, the return should pop the hidden pointer.
3766 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3767 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3768 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3769 FuncInfo->setBytesToPopOnReturn(4);
3770 }
3771
3772 if (!Is64Bit) {
3773 // RegSaveFrameIndex is X86-64 only.
3774 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3775 }
3776
3777 FuncInfo->setArgumentStackSize(StackSize);
3778
3779 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3780 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3781 if (Personality == EHPersonality::CoreCLR) {
3782 assert(Is64Bit);
3783 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3784 // that we'd prefer this slot be allocated towards the bottom of the frame
3785 // (i.e. near the stack pointer after allocating the frame). Every
3786 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3787 // offset from the bottom of this and each funclet's frame must be the
3788 // same, so the size of funclets' (mostly empty) frames is dictated by
3789 // how far this slot is from the bottom (since they allocate just enough
3790 // space to accommodate holding this slot at the correct offset).
3791 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3792 EHInfo->PSPSymFrameIdx = PSPSymFI;
3793 }
3794 }
3795
3796 if (CallConv == CallingConv::X86_RegCall ||
3797 F.hasFnAttribute("no_caller_saved_registers")) {
3798 MachineRegisterInfo &MRI = MF.getRegInfo();
3799 for (std::pair<Register, Register> Pair : MRI.liveins())
3800 MRI.disableCalleeSavedRegister(Pair.first);
3801 }
3802
3803 return Chain;
3804}
3805
3806SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3807 SDValue Arg, const SDLoc &dl,
3808 SelectionDAG &DAG,
3809 const CCValAssign &VA,
3810 ISD::ArgFlagsTy Flags,
3811 bool isByVal) const {
3812 unsigned LocMemOffset = VA.getLocMemOffset();
3813 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3814 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3815 StackPtr, PtrOff);
3816 if (isByVal)
3817 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3818
3819 return DAG.getStore(
3820 Chain, dl, Arg, PtrOff,
3821 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3822}
3823
3824/// Emit a load of return address if tail call
3825/// optimization is performed and it is required.
3826SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3827 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3828 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3829 // Adjust the Return address stack slot.
3830 EVT VT = getPointerTy(DAG.getDataLayout());
3831 OutRetAddr = getReturnAddressFrameIndex(DAG);
3832
3833 // Load the "old" Return address.
3834 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3835 return SDValue(OutRetAddr.getNode(), 1);
3836}
3837
3838/// Emit a store of the return address if tail call
3839/// optimization is performed and it is required (FPDiff!=0).
3840static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3841 SDValue Chain, SDValue RetAddrFrIdx,
3842 EVT PtrVT, unsigned SlotSize,
3843 int FPDiff, const SDLoc &dl) {
3844 // Store the return address to the appropriate stack slot.
3845 if (!FPDiff) return Chain;
3846 // Calculate the new stack slot for the return address.
3847 int NewReturnAddrFI =
3848 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3849 false);
3850 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3851 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3852 MachinePointerInfo::getFixedStack(
3853 DAG.getMachineFunction(), NewReturnAddrFI));
3854 return Chain;
3855}
3856
3857/// Returns a vector_shuffle mask for an movs{s|d}, movd
3858/// operation of specified width.
3859static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3860 SDValue V2) {
3861 unsigned NumElems = VT.getVectorNumElements();
3862 SmallVector<int, 8> Mask;
3863 Mask.push_back(NumElems);
3864 for (unsigned i = 1; i != NumElems; ++i)
3865 Mask.push_back(i);
3866 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3867}
3868
3869SDValue
3870X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3871 SmallVectorImpl<SDValue> &InVals) const {
3872 SelectionDAG &DAG = CLI.DAG;
3873 SDLoc &dl = CLI.DL;
3874 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3875 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3876 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3877 SDValue Chain = CLI.Chain;
3878 SDValue Callee = CLI.Callee;
3879 CallingConv::ID CallConv = CLI.CallConv;
3880 bool &isTailCall = CLI.IsTailCall;
3881 bool isVarArg = CLI.IsVarArg;
3882
3883 MachineFunction &MF = DAG.getMachineFunction();
3884 bool Is64Bit = Subtarget.is64Bit();
3885 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3886 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3887 bool IsSibcall = false;
3888 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3889 CallConv == CallingConv::Tail;
3890 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3891 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
3892 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3893 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3894 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3895 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
3896 bool HasNoCfCheck =
3897 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3898 bool IsIndirectCall = (CI && CI->isIndirectCall());
3899 const Module *M = MF.getMMI().getModule();
3900 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3901
3902 MachineFunction::CallSiteInfo CSInfo;
3903 if (CallConv == CallingConv::X86_INTR)
3904 report_fatal_error("X86 interrupts may not be called directly");
3905
3906 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3907 // If we are using a GOT, disable tail calls to external symbols with
3908 // default visibility. Tail calling such a symbol requires using a GOT
3909 // relocation, which forces early binding of the symbol. This breaks code
3910 that requires lazy function symbol resolution. Using musttail or
3911 // GuaranteedTailCallOpt will override this.
3912 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3913 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3914 G->getGlobal()->hasDefaultVisibility()))
3915 isTailCall = false;
3916 }
3917
3918 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3919 if (IsMustTail) {
3920 // Force this to be a tail call. The verifier rules are enough to ensure
3921 // that we can lower this successfully without moving the return address
3922 // around.
3923 isTailCall = true;
3924 } else if (isTailCall) {
3925 // Check if it's really possible to do a tail call.
3926 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3927 isVarArg, SR != NotStructReturn,
3928 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3929 Outs, OutVals, Ins, DAG);
3930
3931 // Sibcalls are automatically detected tailcalls which do not require
3932 // ABI changes.
3933 if (!IsGuaranteeTCO && isTailCall)
3934 IsSibcall = true;
3935
3936 if (isTailCall)
3937 ++NumTailCalls;
3938 }
3939
3940 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3941 "Var args not supported with calling convention fastcc, ghc or hipe");
3942
3943 // Analyze operands of the call, assigning locations to each operand.
3944 SmallVector<CCValAssign, 16> ArgLocs;
3945 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3946
3947 // Allocate shadow area for Win64.
3948 if (IsWin64)
3949 CCInfo.AllocateStack(32, Align(8));
3950
3951 CCInfo.AnalyzeArguments(Outs, CC_X86);
3952
3953 // In vectorcall calling convention a second pass is required for the HVA
3954 // types.
3955 if (CallingConv::X86_VectorCall == CallConv) {
3956 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3957 }
3958
3959 // Get a count of how many bytes are to be pushed on the stack.
3960 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3961 if (IsSibcall)
3962 // This is a sibcall. The memory operands are available in caller's
3963 // own caller's stack.
3964 NumBytes = 0;
3965 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
3966 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3967
3968 int FPDiff = 0;
3969 if (isTailCall && !IsSibcall && !IsMustTail) {
3970 // Lower arguments at fp - stackoffset + fpdiff.
3971 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3972
3973 FPDiff = NumBytesCallerPushed - NumBytes;
3974
3975 // Set the delta of movement of the returnaddr stackslot.
3976 // But only set if delta is greater than previous delta.
3977 if (FPDiff < X86Info->getTCReturnAddrDelta())
3978 X86Info->setTCReturnAddrDelta(FPDiff);
3979 }
3980
3981 unsigned NumBytesToPush = NumBytes;
3982 unsigned NumBytesToPop = NumBytes;
3983
3984 // If we have an inalloca argument, all stack space has already been allocated
3985 // for us and is right at the top of the stack. We don't support multiple
3986 // arguments passed in memory when using inalloca.
3987 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3988 NumBytesToPush = 0;
3989 if (!ArgLocs.back().isMemLoc())
3990 report_fatal_error("cannot use inalloca attribute on a register "
3991 "parameter");
3992 if (ArgLocs.back().getLocMemOffset() != 0)
3993 report_fatal_error("any parameter with the inalloca attribute must be "
3994 "the only memory argument");
3995 } else if (CLI.IsPreallocated) {
3996 assert(ArgLocs.back().isMemLoc() &&
3997 "cannot use preallocated attribute on a register "
3998 "parameter");
3999 SmallVector<size_t, 4> PreallocatedOffsets;
4000 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4001 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4002 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4003 }
4004 }
4005 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4006 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4007 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4008 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4009 NumBytesToPush = 0;
4010 }
4011
4012 if (!IsSibcall && !IsMustTail)
4013 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4014 NumBytes - NumBytesToPush, dl);
4015
4016 SDValue RetAddrFrIdx;
4017 // Load return address for tail calls.
4018 if (isTailCall && FPDiff)
4019 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4020 Is64Bit, FPDiff, dl);
4021
4022 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4023 SmallVector<SDValue, 8> MemOpChains;
4024 SDValue StackPtr;
4025
4026 // The next loop assumes that the locations are in the same order as the
4027 // input arguments.
4028 assert(isSortedByValueNo(ArgLocs) &&
4029 "Argument Location list must be sorted before lowering");
4030
4031 // Walk the register/memloc assignments, inserting copies/loads. In the case
4032 // of tail call optimization, arguments are handled later.
4033 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4034 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4035 ++I, ++OutIndex) {
4036 assert(OutIndex < Outs.size() && "Invalid Out index");
4037 // Skip inalloca/preallocated arguments, they have already been written.
4038 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4039 if (Flags.isInAlloca() || Flags.isPreallocated())
4040 continue;
4041
4042 CCValAssign &VA = ArgLocs[I];
4043 EVT RegVT = VA.getLocVT();
4044 SDValue Arg = OutVals[OutIndex];
4045 bool isByVal = Flags.isByVal();
4046
4047 // Promote the value if needed.
4048 switch (VA.getLocInfo()) {
4049 default: llvm_unreachable("Unknown loc info!");
4050 case CCValAssign::Full: break;
4051 case CCValAssign::SExt:
4052 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4053 break;
4054 case CCValAssign::ZExt:
4055 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4056 break;
4057 case CCValAssign::AExt:
4058 if (Arg.getValueType().isVector() &&
4059 Arg.getValueType().getVectorElementType() == MVT::i1)
4060 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4061 else if (RegVT.is128BitVector()) {
4062 // Special case: passing MMX values in XMM registers.
4063 Arg = DAG.getBitcast(MVT::i64, Arg);
4064 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4065 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4066 } else
4067 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4068 break;
4069 case CCValAssign::BCvt:
4070 Arg = DAG.getBitcast(RegVT, Arg);
4071 break;
4072 case CCValAssign::Indirect: {
4073 if (isByVal) {
4074 // Memcpy the argument to a temporary stack slot to prevent
4075 // the caller from seeing any modifications the callee may make
4076 // as guaranteed by the `byval` attribute.
4077 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4078 Flags.getByValSize(),
4079 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4080 SDValue StackSlot =
4081 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4082 Chain =
4083 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4084 // From now on treat this as a regular pointer
4085 Arg = StackSlot;
4086 isByVal = false;
4087 } else {
4088 // Store the argument.
4089 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4090 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4091 Chain = DAG.getStore(
4092 Chain, dl, Arg, SpillSlot,
4093 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4094 Arg = SpillSlot;
4095 }
4096 break;
4097 }
4098 }
4099
4100 if (VA.needsCustom()) {
4101 assert(VA.getValVT() == MVT::v64i1 &&
4102 "Currently the only custom case is when we split v64i1 to 2 regs");
4103 // Split v64i1 value into two registers
4104 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4105 } else if (VA.isRegLoc()) {
4106 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4107 const TargetOptions &Options = DAG.getTarget().Options;
4108 if (Options.EmitCallSiteInfo)
4109 CSInfo.emplace_back(VA.getLocReg(), I);
4110 if (isVarArg && IsWin64) {
4111 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4112 // shadow reg if callee is a varargs function.
4113 Register ShadowReg;
4114 switch (VA.getLocReg()) {
4115 case X86::XMM0: ShadowReg = X86::RCX; break;
4116 case X86::XMM1: ShadowReg = X86::RDX; break;
4117 case X86::XMM2: ShadowReg = X86::R8; break;
4118 case X86::XMM3: ShadowReg = X86::R9; break;
4119 }
4120 if (ShadowReg)
4121 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4122 }
4123 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4124 assert(VA.isMemLoc());
4125 if (!StackPtr.getNode())
4126 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4127 getPointerTy(DAG.getDataLayout()));
4128 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4129 dl, DAG, VA, Flags, isByVal));
4130 }
4131 }
4132
4133 if (!MemOpChains.empty())
4134 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4135
4136 if (Subtarget.isPICStyleGOT()) {
4137 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4138 // GOT pointer (except regcall).
4139 if (!isTailCall) {
4140 // An indirect call with the RegCall calling convention may use up all the
4141 // general registers, so it is not suitable to bind the EBX register for the
4142 // GOT address; just let the register allocator handle it.
4143 if (CallConv != CallingConv::X86_RegCall)
4144 RegsToPass.push_back(std::make_pair(
4145 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4146 getPointerTy(DAG.getDataLayout()))));
4147 } else {
4148 // If we are tail calling and generating PIC/GOT style code load the
4149 // address of the callee into ECX. The value in ecx is used as target of
4150 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4151 // for tail calls on PIC/GOT architectures. Normally we would just put the
4152 // address of GOT into ebx and then call target@PLT. But for tail calls
4153 // ebx would be restored (since ebx is callee saved) before jumping to the
4154 // target@PLT.
4155
4156 // Note: The actual moving to ECX is done further down.
4157 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4158 if (G && !G->getGlobal()->hasLocalLinkage() &&
4159 G->getGlobal()->hasDefaultVisibility())
4160 Callee = LowerGlobalAddress(Callee, DAG);
4161 else if (isa<ExternalSymbolSDNode>(Callee))
4162 Callee = LowerExternalSymbol(Callee, DAG);
4163 }
4164 }
4165
4166 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4167 // From AMD64 ABI document:
4168 // For calls that may call functions that use varargs or stdargs
4169 // (prototype-less calls or calls to functions containing ellipsis (...) in
4170 // the declaration) %al is used as hidden argument to specify the number
4171 // of SSE registers used. The contents of %al do not need to match exactly
4172 the number of registers, but must be an upper bound on the number of SSE
4173 // registers used and is in the range 0 - 8 inclusive.
4174
4175 // Count the number of XMM registers allocated.
4176 static const MCPhysReg XMMArgRegs[] = {
4177 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4178 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4179 };
4180 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4181 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4182 && "SSE registers cannot be used when SSE is disabled");
4183 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4184 DAG.getConstant(NumXMMRegs, dl,
4185 MVT::i8)));
4186 }
4187
4188 if (isVarArg && IsMustTail) {
4189 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4190 for (const auto &F : Forwards) {
4191 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4192 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4193 }
4194 }
4195
4196 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4197 // don't need this because the eligibility check rejects calls that require
4198 // shuffling arguments passed in memory.
4199 if (!IsSibcall && isTailCall) {
4200 // Force all the incoming stack arguments to be loaded from the stack
4201 // before any new outgoing arguments are stored to the stack, because the
4202 // outgoing stack slots may alias the incoming argument stack slots, and
4203 // the alias isn't otherwise explicit. This is slightly more conservative
4204 // than necessary, because it means that each store effectively depends
4205 // on every argument instead of just those arguments it would clobber.
4206 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4207
4208 SmallVector<SDValue, 8> MemOpChains2;
4209 SDValue FIN;
4210 int FI = 0;
4211 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4212 ++I, ++OutsIndex) {
4213 CCValAssign &VA = ArgLocs[I];
4214
4215 if (VA.isRegLoc()) {
4216 if (VA.needsCustom()) {
4217 assert((CallConv == CallingConv::X86_RegCall) &&
4218 "Expecting custom case only in regcall calling convention");
4219 // This means that we are in special case where one argument was
4220 // passed through two register locations - Skip the next location
4221 ++I;
4222 }
4223
4224 continue;
4225 }
4226
4227 assert(VA.isMemLoc());
4228 SDValue Arg = OutVals[OutsIndex];
4229 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4230 // Skip inalloca/preallocated arguments. They don't require any work.
4231 if (Flags.isInAlloca() || Flags.isPreallocated())
4232 continue;
4233 // Create frame index.
4234 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4235 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4236 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4237 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4238
4239 if (Flags.isByVal()) {
4240 // Copy relative to framepointer.
4241 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4242 if (!StackPtr.getNode())
4243 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4244 getPointerTy(DAG.getDataLayout()));
4245 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4246 StackPtr, Source);
4247
4248 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4249 ArgChain,
4250 Flags, DAG, dl));
4251 } else {
4252 // Store relative to framepointer.
4253 MemOpChains2.push_back(DAG.getStore(
4254 ArgChain, dl, Arg, FIN,
4255 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4256 }
4257 }
4258
4259 if (!MemOpChains2.empty())
4260 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4261
4262 // Store the return address to the appropriate stack slot.
4263 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4264 getPointerTy(DAG.getDataLayout()),
4265 RegInfo->getSlotSize(), FPDiff, dl);
4266 }
4267
4268 // Build a sequence of copy-to-reg nodes chained together with token chain
4269 // and flag operands which copy the outgoing args into registers.
4270 SDValue InFlag;
4271 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4272 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4273 RegsToPass[i].second, InFlag);
4274 InFlag = Chain.getValue(1);
4275 }
4276
4277 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4278 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4279 // In the 64-bit large code model, we have to make all calls
4280 // through a register, since the call instruction's 32-bit
4281 // pc-relative offset may not be large enough to hold the whole
4282 // address.
4283 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4284 Callee->getOpcode() == ISD::ExternalSymbol) {
4285 // Lower direct calls to global addresses and external symbols. Setting
4286 // ForCall to true here has the effect of removing WrapperRIP when possible
4287 // to allow direct calls to be selected without first materializing the
4288 // address into a register.
4289 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4290 } else if (Subtarget.isTarget64BitILP32() &&
4291 Callee->getValueType(0) == MVT::i32) {
4292 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4293 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4294 }
4295
4296 // Returns a chain & a flag for retval copy to use.
4297 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4298 SmallVector<SDValue, 8> Ops;
4299
4300 if (!IsSibcall && isTailCall && !IsMustTail) {
4301 Chain = DAG.getCALLSEQ_END(Chain,
4302 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4303 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4304 InFlag = Chain.getValue(1);
4305 }
4306
4307 Ops.push_back(Chain);
4308 Ops.push_back(Callee);
4309
4310 if (isTailCall)
4311 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4312
4313 // Add argument registers to the end of the list so that they are known live
4314 // into the call.
4315 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4316 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4317 RegsToPass[i].second.getValueType()));
4318
4319 // Add a register mask operand representing the call-preserved registers.
4320 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
4321 // set X86_INTR calling convention because it has the same CSR mask
4322 // (same preserved registers).
4323 const uint32_t *Mask = RegInfo->getCallPreservedMask(
4324 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
4325 assert(Mask && "Missing call preserved mask for calling convention");
4326
4327 // If this is an invoke in a 32-bit function using a funclet-based
4328 // personality, assume the function clobbers all registers. If an exception
4329 // is thrown, the runtime will not restore CSRs.
4330 // FIXME: Model this more precisely so that we can register allocate across
4331 // the normal edge and spill and fill across the exceptional edge.
4332 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4333 const Function &CallerFn = MF.getFunction();
4334 EHPersonality Pers =
4335 CallerFn.hasPersonalityFn()
4336 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4337 : EHPersonality::Unknown;
4338 if (isFuncletEHPersonality(Pers))
4339 Mask = RegInfo->getNoPreservedMask();
4340 }
4341
4342 // Define a new register mask from the existing mask.
4343 uint32_t *RegMask = nullptr;
4344
4345 // In some calling conventions we need to remove the used physical registers
4346 // from the reg mask.
4347 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4348 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4349
4350 // Allocate a new Reg Mask and copy Mask.
4351 RegMask = MF.allocateRegMask();
4352 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4353 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4354
4355 // Make sure all sub registers of the argument registers are reset
4356 // in the RegMask.
4357 for (auto const &RegPair : RegsToPass)
4358 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4359 SubRegs.isValid(); ++SubRegs)
4360 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4361
4362 // Create the RegMask Operand according to our updated mask.
4363 Ops.push_back(DAG.getRegisterMask(RegMask));
4364 } else {
4365 // Create the RegMask Operand according to the static mask.
4366 Ops.push_back(DAG.getRegisterMask(Mask));
4367 }
4368
4369 if (InFlag.getNode())
4370 Ops.push_back(InFlag);
4371
4372 if (isTailCall) {
4373 // We used to do:
4374 //// If this is the first return lowered for this function, add the regs
4375 //// to the liveout set for the function.
4376 // This isn't right, although it's probably harmless on x86; liveouts
4377 // should be computed from returns not tail calls. Consider a void
4378 // function making a tail call to a function returning int.
4379 MF.getFrameInfo().setHasTailCall();
4380 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4381 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4382 return Ret;
4383 }
4384
4385 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4386 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4387 } else {
4388 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4389 }
4390 InFlag = Chain.getValue(1);
4391 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4392 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4393
4394 // Save heapallocsite metadata.
4395 if (CLI.CB)
4396 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4397 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4398
4399 // Create the CALLSEQ_END node.
4400 unsigned NumBytesForCalleeToPop;
4401 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4402 DAG.getTarget().Options.GuaranteedTailCallOpt))
4403 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4404 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4405 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4406 SR == StackStructReturn)
4407 // If this is a call to a struct-return function, the callee
4408 // pops the hidden struct pointer, so we have to push it back.
4409 // This is common for Darwin/X86, Linux & Mingw32 targets.
4410 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4411 NumBytesForCalleeToPop = 4;
4412 else
4413 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4414
4415 // Returns a flag for retval copy to use.
4416 if (!IsSibcall) {
4417 Chain = DAG.getCALLSEQ_END(Chain,
4418 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4419 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4420 true),
4421 InFlag, dl);
4422 InFlag = Chain.getValue(1);
4423 }
4424
4425 // Handle result values, copying them out of physregs into vregs that we
4426 // return.
4427 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4428 InVals, RegMask);
4429}
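// Illustrative sketch (not LLVM's own code) of the register-mask update performed
// in LowerCall above for RegCall / no_caller_saved_registers calls: one bit per
// register is packed into 32-bit words, and clearing a bit marks the register as
// clobbered rather than preserved. The register numbers below are made up.
#include <cassert>
#include <cstdint>
#include <vector>

static void clearRegFromMask(std::vector<uint32_t> &RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32)); // same arithmetic as in LowerCall
}

int main() {
  std::vector<uint32_t> Mask(4, ~0u);  // pretend all 128 registers are preserved
  clearRegFromMask(Mask, 37);          // hypothetical register/sub-register number
  assert((Mask[1] & (1u << 5)) == 0);  // bit 37 lives in word 1, bit position 5
  assert(Mask[0] == ~0u);              // other words are untouched
  return 0;
}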
4430
4431//===----------------------------------------------------------------------===//
4432// Fast Calling Convention (tail call) implementation
4433//===----------------------------------------------------------------------===//
4434
4435 // Like stdcall, the callee cleans up the arguments; the convention differs in that ECX is
4436// reserved for storing the tail called function address. Only 2 registers are
4437// free for argument passing (inreg). Tail call optimization is performed
4438// provided:
4439// * tailcallopt is enabled
4440// * caller/callee are fastcc
4441// On X86_64 architecture with GOT-style position independent code only local
4442// (within module) calls are supported at the moment.
4443 // To keep the stack aligned according to the platform ABI, the function
4444 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
4445// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4446 // If a tail-called callee has more arguments than the caller, the
4447// caller needs to make sure that there is room to move the RETADDR to. This is
4448// achieved by reserving an area the size of the argument delta right after the
4449// original RETADDR, but before the saved framepointer or the spilled registers
4450// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4451// stack layout:
4452// arg1
4453// arg2
4454// RETADDR
4455// [ new RETADDR
4456// move area ]
4457// (possible EBP)
4458// ESI
4459// EDI
4460// local1 ..
4461
4462 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
4463/// requirement.
4464unsigned
4465X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4466 SelectionDAG &DAG) const {
4467 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4468 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4469 assert(StackSize % SlotSize == 0 &&
4470 "StackSize must be a multiple of SlotSize");
4471 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4472}
4473
4474/// Return true if the given stack call argument is already available in the
4475/// same position (relatively) of the caller's incoming argument stack.
4476static
4477bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4478 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4479 const X86InstrInfo *TII, const CCValAssign &VA) {
4480 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4481
4482 for (;;) {
4483 // Look through nodes that don't alter the bits of the incoming value.
4484 unsigned Op = Arg.getOpcode();
4485 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4486 Arg = Arg.getOperand(0);
4487 continue;
4488 }
4489 if (Op == ISD::TRUNCATE) {
4490 const SDValue &TruncInput = Arg.getOperand(0);
4491 if (TruncInput.getOpcode() == ISD::AssertZext &&
4492 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4493 Arg.getValueType()) {
4494 Arg = TruncInput.getOperand(0);
4495 continue;
4496 }
4497 }
4498 break;
4499 }
4500
4501 int FI = INT_MAX;
4502 if (Arg.getOpcode() == ISD::CopyFromReg) {
4503 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4504 if (!VR.isVirtual())
4505 return false;
4506 MachineInstr *Def = MRI->getVRegDef(VR);
4507 if (!Def)
4508 return false;
4509 if (!Flags.isByVal()) {
4510 if (!TII->isLoadFromStackSlot(*Def, FI))
4511 return false;
4512 } else {
4513 unsigned Opcode = Def->getOpcode();
4514 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4515 Opcode == X86::LEA64_32r) &&
4516 Def->getOperand(1).isFI()) {
4517 FI = Def->getOperand(1).getIndex();
4518 Bytes = Flags.getByValSize();
4519 } else
4520 return false;
4521 }
4522 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4523 if (Flags.isByVal())
4524 // ByVal argument is passed in as a pointer but it's now being
4525 // dereferenced. e.g.
4526 // define @foo(%struct.X* %A) {
4527 // tail call @bar(%struct.X* byval %A)
4528 // }
4529 return false;
4530 SDValue Ptr = Ld->getBasePtr();
4531 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4532 if (!FINode)
4533 return false;
4534 FI = FINode->getIndex();
4535 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4536 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4537 FI = FINode->getIndex();
4538 Bytes = Flags.getByValSize();
4539 } else
4540 return false;
4541
4542 assert(FI != INT_MAX);
4543 if (!MFI.isFixedObjectIndex(FI))
4544 return false;
4545
4546 if (Offset != MFI.getObjectOffset(FI))
4547 return false;
4548
4549 // If this is not byval, check that the argument stack object is immutable.
4550 // inalloca and argument copy elision can create mutable argument stack
4551 // objects. Byval objects can be mutated, but a byval call intends to pass the
4552 // mutated memory.
4553 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4554 return false;
4555
4556 if (VA.getLocVT().getFixedSizeInBits() >
4557 Arg.getValueSizeInBits().getFixedSize()) {
4558 // If the argument location is wider than the argument type, check that any
4559 // extension flags match.
4560 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4561 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4562 return false;
4563 }
4564 }
4565
4566 return Bytes == MFI.getObjectSize(FI);
4567}
4568
4569/// Check whether the call is eligible for tail call optimization. Targets
4570/// that want to do tail call optimization should implement this function.
4571bool X86TargetLowering::IsEligibleForTailCallOptimization(
4572 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4573 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4574 const SmallVectorImpl<ISD::OutputArg> &Outs,
4575 const SmallVectorImpl<SDValue> &OutVals,
4576 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4577 if (!mayTailCallThisCC(CalleeCC))
4578 return false;
4579
4580 // If -tailcallopt is specified, make fastcc functions tail-callable.
4581 MachineFunction &MF = DAG.getMachineFunction();
4582 const Function &CallerF = MF.getFunction();
4583
4584 // If the function return type is x86_fp80 and the callee return type is not,
4585 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4586 // perform a tailcall optimization here.
4587 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4588 return false;
4589
4590 CallingConv::ID CallerCC = CallerF.getCallingConv();
4591 bool CCMatch = CallerCC == CalleeCC;
4592 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4593 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4594 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4595 CalleeCC == CallingConv::Tail;
4596
4597 // Win64 functions have extra shadow space for argument homing. Don't do the
4598 // sibcall if the caller and callee have mismatched expectations for this
4599 // space.
4600 if (IsCalleeWin64 != IsCallerWin64)
4601 return false;
4602
4603 if (IsGuaranteeTCO) {
4604 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4605 return true;
4606 return false;
4607 }
4608
4609 // Look for obvious safe cases to perform tail call optimization that do not
4610 // require ABI changes. This is what gcc calls sibcall.
4611
4612 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4613 // emit a special epilogue.
4614 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4615 if (RegInfo->needsStackRealignment(MF))
4616 return false;
4617
4618 // Also avoid sibcall optimization if either caller or callee uses struct
4619 // return semantics.
4620 if (isCalleeStructRet || isCallerStructRet)
4621 return false;
4622
4623 // Do not sibcall optimize vararg calls unless all arguments are passed via
4624 // registers.
4625 LLVMContext &C = *DAG.getContext();
4626 if (isVarArg && !Outs.empty()) {
4627 // Optimizing for varargs on Win64 is unlikely to be safe without
4628 // additional testing.
4629 if (IsCalleeWin64 || IsCallerWin64)
4630 return false;
4631
4632 SmallVector<CCValAssign, 16> ArgLocs;
4633 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4634
4635 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4636 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4637 if (!ArgLocs[i].isRegLoc())
4638 return false;
4639 }
4640
4641 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4642 // stack. Therefore, if it's not used by the call it is not safe to optimize
4643 // this into a sibcall.
4644 bool Unused = false;
4645 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4646 if (!Ins[i].Used) {
4647 Unused = true;
4648 break;
4649 }
4650 }
4651 if (Unused) {
4652 SmallVector<CCValAssign, 16> RVLocs;
4653 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4654 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4655 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4656 CCValAssign &VA = RVLocs[i];
4657 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4658 return false;
4659 }
4660 }
4661
4662 // Check that the call results are passed in the same way.
4663 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4664 RetCC_X86, RetCC_X86))
4665 return false;
4666 // The callee has to preserve all registers the caller needs to preserve.
4667 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4668 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4669 if (!CCMatch) {
4670 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4671 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4672 return false;
4673 }
4674
4675 unsigned StackArgsSize = 0;
4676
4677 // If the callee takes no arguments then go on to check the results of the
4678 // call.
4679 if (!Outs.empty()) {
4680 // Check if stack adjustment is needed. For now, do not do this if any
4681 // argument is passed on the stack.
4682 SmallVector<CCValAssign, 16> ArgLocs;
4683 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4684
4685 // Allocate shadow area for Win64
4686 if (IsCalleeWin64)
4687 CCInfo.AllocateStack(32, Align(8));
4688
4689 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4690 StackArgsSize = CCInfo.getNextStackOffset();
4691
4692 if (CCInfo.getNextStackOffset()) {
4693 // Check if the arguments are already laid out in the right way as
4694 // the caller's fixed stack objects.
4695 MachineFrameInfo &MFI = MF.getFrameInfo();
4696 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4697 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4698 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4699 CCValAssign &VA = ArgLocs[i];
4700 SDValue Arg = OutVals[i];
4701 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4702 if (VA.getLocInfo() == CCValAssign::Indirect)
4703 return false;
4704 if (!VA.isRegLoc()) {
4705 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4706 MFI, MRI, TII, VA))
4707 return false;
4708 }
4709 }
4710 }
4711
4712 bool PositionIndependent = isPositionIndependent();
4713 // If the tailcall address may be in a register, then make sure it's
4714 // possible to register allocate for it. In 32-bit, the call address can
4715 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4716 // callee-saved registers are restored. These happen to be the same
4717 // registers used to pass 'inreg' arguments so watch out for those.
4718 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4719 !isa<ExternalSymbolSDNode>(Callee)) ||
4720 PositionIndependent)) {
4721 unsigned NumInRegs = 0;
4722 // In PIC we need an extra register to formulate the address computation
4723 // for the callee.
4724 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4725
4726 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4727 CCValAssign &VA = ArgLocs[i];
4728 if (!VA.isRegLoc())
4729 continue;
4730 Register Reg = VA.getLocReg();
4731 switch (Reg) {
4732 default: break;
4733 case X86::EAX: case X86::EDX: case X86::ECX:
4734 if (++NumInRegs == MaxInRegs)
4735 return false;
4736 break;
4737 }
4738 }
4739 }
4740
4741 const MachineRegisterInfo &MRI = MF.getRegInfo();
4742 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4743 return false;
4744 }
4745
4746 bool CalleeWillPop =
4747 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4748 MF.getTarget().Options.GuaranteedTailCallOpt);
4749
4750 if (unsigned BytesToPop =
4751 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4752 // If we have bytes to pop, the callee must pop them.
4753 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4754 if (!CalleePopMatches)
4755 return false;
4756 } else if (CalleeWillPop && StackArgsSize > 0) {
4757 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4758 return false;
4759 }
4760
4761 return true;
4762}
4763
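// A minimal standalone sketch (not the LLVM code above) of the callee-pop
// matching rule at the end of the eligibility check: if the caller itself must
// pop BytesToPop on return, a sibcall callee has to pop exactly that many
// bytes; if the caller pops nothing, the callee must not pop stack arguments
// either. The function and variable names here are illustrative only.
#include <cassert>
#include <cstdint>

static bool calleePopCompatible(bool CalleeWillPop, uint32_t BytesToPop,
                                uint32_t StackArgsSize) {
  if (BytesToPop != 0)
    return CalleeWillPop && BytesToPop == StackArgsSize;
  return !(CalleeWillPop && StackArgsSize > 0);
}

static void calleePopExamples() {
  assert(calleePopCompatible(true, 8, 8));   // both pop the same 8 bytes
  assert(!calleePopCompatible(true, 8, 12)); // callee would pop the wrong amount
  assert(!calleePopCompatible(true, 0, 4));  // callee pops, caller does not
  assert(calleePopCompatible(false, 0, 4));  // caller-pop convention is fine
}
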
4764FastISel *
4765X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4766 const TargetLibraryInfo *libInfo) const {
4767 return X86::createFastISel(funcInfo, libInfo);
4768}
4769
4770//===----------------------------------------------------------------------===//
4771// Other Lowering Hooks
4772//===----------------------------------------------------------------------===//
4773
4774static bool MayFoldLoad(SDValue Op) {
4775 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4776}
4777
4778static bool MayFoldIntoStore(SDValue Op) {
4779 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4780}
4781
4782static bool MayFoldIntoZeroExtend(SDValue Op) {
4783 if (Op.hasOneUse()) {
4784 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4785 return (ISD::ZERO_EXTEND == Opcode);
4786 }
4787 return false;
4788}
4789
4790static bool isTargetShuffle(unsigned Opcode) {
4791 switch(Opcode) {
4792 default: return false;
4793 case X86ISD::BLENDI:
4794 case X86ISD::PSHUFB:
4795 case X86ISD::PSHUFD:
4796 case X86ISD::PSHUFHW:
4797 case X86ISD::PSHUFLW:
4798 case X86ISD::SHUFP:
4799 case X86ISD::INSERTPS:
4800 case X86ISD::EXTRQI:
4801 case X86ISD::INSERTQI:
4802 case X86ISD::VALIGN:
4803 case X86ISD::PALIGNR:
4804 case X86ISD::VSHLDQ:
4805 case X86ISD::VSRLDQ:
4806 case X86ISD::MOVLHPS:
4807 case X86ISD::MOVHLPS:
4808 case X86ISD::MOVSHDUP:
4809 case X86ISD::MOVSLDUP:
4810 case X86ISD::MOVDDUP:
4811 case X86ISD::MOVSS:
4812 case X86ISD::MOVSD:
4813 case X86ISD::UNPCKL:
4814 case X86ISD::UNPCKH:
4815 case X86ISD::VBROADCAST:
4816 case X86ISD::VPERMILPI:
4817 case X86ISD::VPERMILPV:
4818 case X86ISD::VPERM2X128:
4819 case X86ISD::SHUF128:
4820 case X86ISD::VPERMIL2:
4821 case X86ISD::VPERMI:
4822 case X86ISD::VPPERM:
4823 case X86ISD::VPERMV:
4824 case X86ISD::VPERMV3:
4825 case X86ISD::VZEXT_MOVL:
4826 return true;
4827 }
4828}
4829
4830static bool isTargetShuffleVariableMask(unsigned Opcode) {
4831 switch (Opcode) {
4832 default: return false;
4833 // Target Shuffles.
4834 case X86ISD::PSHUFB:
4835 case X86ISD::VPERMILPV:
4836 case X86ISD::VPERMIL2:
4837 case X86ISD::VPPERM:
4838 case X86ISD::VPERMV:
4839 case X86ISD::VPERMV3:
4840 return true;
4841 // 'Faux' Target Shuffles.
4842 case ISD::OR:
4843 case ISD::AND:
4844 case X86ISD::ANDNP:
4845 return true;
4846 }
4847}
4848
4849static bool isTargetShuffleSplat(SDValue Op) {
4850 unsigned Opcode = Op.getOpcode();
4851 if (Opcode == ISD::EXTRACT_SUBVECTOR)
4852 return isTargetShuffleSplat(Op.getOperand(0));
4853 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4854}
4855
4856SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4857 MachineFunction &MF = DAG.getMachineFunction();
4858 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4859 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4860 int ReturnAddrIndex = FuncInfo->getRAIndex();
4861
4862 if (ReturnAddrIndex == 0) {
4863 // Set up a frame object for the return address.
4864 unsigned SlotSize = RegInfo->getSlotSize();
4865 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4866 -(int64_t)SlotSize,
4867 false);
4868 FuncInfo->setRAIndex(ReturnAddrIndex);
4869 }
4870
4871 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4872}
4873
4874bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4875 bool hasSymbolicDisplacement) {
4876  // Offset should fit into a 32-bit immediate field.
4877 if (!isInt<32>(Offset))
4878 return false;
4879
4880 // If we don't have a symbolic displacement - we don't have any extra
4881 // restrictions.
4882 if (!hasSymbolicDisplacement)
4883 return true;
4884
4885 // FIXME: Some tweaks might be needed for medium code model.
4886 if (M != CodeModel::Small && M != CodeModel::Kernel)
4887 return false;
4888
4889  // For the small code model we assume that the last object ends at least 16MB
4890  // before the 2^31 boundary, so an offset below 16MB cannot overflow it. We may
4891  // also accept pretty large negative constants, knowing that all objects are in
4891  // the positive half of the address space.
4892 if (M == CodeModel::Small && Offset < 16*1024*1024)
4893 return true;
4894
4895  // For the kernel code model we know that all objects reside in the negative
4896  // half of the 32-bit address space. We do not accept negative offsets, since
4897  // they may take the address just out of range, but we may accept pretty large
4897  // positive ones.
4898 if (M == CodeModel::Kernel && Offset >= 0)
4899 return true;
4900
4901 return false;
4902}
4903
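// A minimal standalone sketch (not the LLVM code above) of the offset rules in
// X86::isOffsetSuitableForCodeModel: with a symbolic displacement, the small
// code model accepts offsets below 16MB (including large negative ones), while
// the kernel code model accepts only non-negative offsets; everything must fit
// in a signed 32-bit immediate. Names below are illustrative only.
#include <cassert>
#include <cstdint>
#include <limits>

enum class Model { Small, Kernel, Other };

static bool offsetSuitable(int64_t Offset, Model M, bool HasSymbolicDisp) {
  if (Offset < std::numeric_limits<int32_t>::min() ||
      Offset > std::numeric_limits<int32_t>::max())
    return false;                        // must fit a 32-bit immediate
  if (!HasSymbolicDisp)
    return true;                         // no extra restrictions
  if (M == Model::Small)
    return Offset < 16 * 1024 * 1024;    // objects assumed 16MB below 2^31
  if (M == Model::Kernel)
    return Offset >= 0;                  // objects live in the negative 2GB
  return false;
}

static void offsetExamples() {
  assert(offsetSuitable(1 << 20, Model::Small, true));       // 1MB is fine
  assert(!offsetSuitable(32 << 20, Model::Small, true));     // 32MB is too far
  assert(offsetSuitable(-(1LL << 30), Model::Small, true));  // big negative ok
  assert(!offsetSuitable(-8, Model::Kernel, true));          // negative rejected
  assert(!offsetSuitable(1LL << 31, Model::Small, false));   // not an int32
}
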
4904/// Determines whether the callee is required to pop its own arguments.
4905/// Callee pop is necessary to support tail calls.
4906bool X86::isCalleePop(CallingConv::ID CallingConv,
4907 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4908 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4909 // can guarantee TCO.
4910 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4911 return true;
4912
4913 switch (CallingConv) {
4914 default:
4915 return false;
4916 case CallingConv::X86_StdCall:
4917 case CallingConv::X86_FastCall:
4918 case CallingConv::X86_ThisCall:
4919 case CallingConv::X86_VectorCall:
4920 return !is64Bit;
4921 }
4922}
4923
4924/// Return true if the condition is a signed comparison operation.
4925static bool isX86CCSigned(unsigned X86CC) {
4926 switch (X86CC) {
4927 default:
4928    llvm_unreachable("Invalid integer condition!");
4929 case X86::COND_E:
4930 case X86::COND_NE:
4931 case X86::COND_B:
4932 case X86::COND_A:
4933 case X86::COND_BE:
4934 case X86::COND_AE:
4935 return false;
4936 case X86::COND_G:
4937 case X86::COND_GE:
4938 case X86::COND_L:
4939 case X86::COND_LE:
4940 return true;
4941 }
4942}
4943
4944static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4945 switch (SetCCOpcode) {
4946  default: llvm_unreachable("Invalid integer condition!");
4947 case ISD::SETEQ: return X86::COND_E;
4948 case ISD::SETGT: return X86::COND_G;
4949 case ISD::SETGE: return X86::COND_GE;
4950 case ISD::SETLT: return X86::COND_L;
4951 case ISD::SETLE: return X86::COND_LE;
4952 case ISD::SETNE: return X86::COND_NE;
4953 case ISD::SETULT: return X86::COND_B;
4954 case ISD::SETUGT: return X86::COND_A;
4955 case ISD::SETULE: return X86::COND_BE;
4956 case ISD::SETUGE: return X86::COND_AE;
4957 }
4958}
4959
4960/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4961/// condition code, returning the condition code and the LHS/RHS of the
4962/// comparison to make.
4963static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4964 bool isFP, SDValue &LHS, SDValue &RHS,
4965 SelectionDAG &DAG) {
4966 if (!isFP) {
4967 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4968 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4969 // X > -1 -> X == 0, jump !sign.
4970 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4971 return X86::COND_NS;
4972 }
4973 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4974 // X < 0 -> X == 0, jump on sign.
4975 return X86::COND_S;
4976 }
4977 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
4978 // X >= 0 -> X == 0, jump on !sign.
4979 return X86::COND_NS;
4980 }
4981 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
4982 // X < 1 -> X <= 0
4983 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4984 return X86::COND_LE;
4985 }
4986 }
4987
4988 return TranslateIntegerX86CC(SetCCOpcode);
4989 }
4990
4991  // First determine whether it is required or profitable to flip the operands.
4992
4993 // If LHS is a foldable load, but RHS is not, flip the condition.
4994 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4995 !ISD::isNON_EXTLoad(RHS.getNode())) {
4996 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4997 std::swap(LHS, RHS);
4998 }
4999
5000 switch (SetCCOpcode) {
5001 default: break;
5002 case ISD::SETOLT:
5003 case ISD::SETOLE:
5004 case ISD::SETUGT:
5005 case ISD::SETUGE:
5006 std::swap(LHS, RHS);
5007 break;
5008 }
5009
5010 // On a floating point condition, the flags are set as follows:
5011 // ZF PF CF op
5012 // 0 | 0 | 0 | X > Y
5013 // 0 | 0 | 1 | X < Y
5014 // 1 | 0 | 0 | X == Y
5015 // 1 | 1 | 1 | unordered
5016 switch (SetCCOpcode) {
5017  default: llvm_unreachable("Condcode should be pre-legalized away");
5018 case ISD::SETUEQ:
5019 case ISD::SETEQ: return X86::COND_E;
5020 case ISD::SETOLT: // flipped
5021 case ISD::SETOGT:
5022 case ISD::SETGT: return X86::COND_A;
5023 case ISD::SETOLE: // flipped
5024 case ISD::SETOGE:
5025 case ISD::SETGE: return X86::COND_AE;
5026 case ISD::SETUGT: // flipped
5027 case ISD::SETULT:
5028 case ISD::SETLT: return X86::COND_B;
5029 case ISD::SETUGE: // flipped
5030 case ISD::SETULE:
5031 case ISD::SETLE: return X86::COND_BE;
5032 case ISD::SETONE:
5033 case ISD::SETNE: return X86::COND_NE;
5034 case ISD::SETUO: return X86::COND_P;
5035 case ISD::SETO: return X86::COND_NP;
5036 case ISD::SETOEQ:
5037 case ISD::SETUNE: return X86::COND_INVALID;
5038 }
5039}
5040
5041/// Is there a floating point cmov for the specific X86 condition code?
5042/// The current x86 ISA includes the following FP cmov instructions:
5043/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5044static bool hasFPCMov(unsigned X86CC) {
5045 switch (X86CC) {
5046 default:
5047 return false;
5048 case X86::COND_B:
5049 case X86::COND_BE:
5050 case X86::COND_E:
5051 case X86::COND_P:
5052 case X86::COND_A:
5053 case X86::COND_AE:
5054 case X86::COND_NE:
5055 case X86::COND_NP:
5056 return true;
5057 }
5058}
5059
5060
5061bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5062 const CallInst &I,
5063 MachineFunction &MF,
5064 unsigned Intrinsic) const {
5065 Info.flags = MachineMemOperand::MONone;
5066 Info.offset = 0;
5067
5068 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5069 if (!IntrData) {
5070 switch (Intrinsic) {
5071 case Intrinsic::x86_aesenc128kl:
5072 case Intrinsic::x86_aesdec128kl:
5073 Info.opc = ISD::INTRINSIC_W_CHAIN;
5074 Info.ptrVal = I.getArgOperand(1);
5075 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5076 Info.align = Align(1);
5077 Info.flags |= MachineMemOperand::MOLoad;
5078 return true;
5079 case Intrinsic::x86_aesenc256kl:
5080 case Intrinsic::x86_aesdec256kl:
5081 Info.opc = ISD::INTRINSIC_W_CHAIN;
5082 Info.ptrVal = I.getArgOperand(1);
5083 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5084 Info.align = Align(1);
5085 Info.flags |= MachineMemOperand::MOLoad;
5086 return true;
5087 case Intrinsic::x86_aesencwide128kl:
5088 case Intrinsic::x86_aesdecwide128kl:
5089 Info.opc = ISD::INTRINSIC_W_CHAIN;
5090 Info.ptrVal = I.getArgOperand(0);
5091 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5092 Info.align = Align(1);
5093 Info.flags |= MachineMemOperand::MOLoad;
5094 return true;
5095 case Intrinsic::x86_aesencwide256kl:
5096 case Intrinsic::x86_aesdecwide256kl:
5097 Info.opc = ISD::INTRINSIC_W_CHAIN;
5098 Info.ptrVal = I.getArgOperand(0);
5099 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5100 Info.align = Align(1);
5101 Info.flags |= MachineMemOperand::MOLoad;
5102 return true;
5103 }
5104 return false;
5105 }
5106
5107 switch (IntrData->Type) {
5108 case TRUNCATE_TO_MEM_VI8:
5109 case TRUNCATE_TO_MEM_VI16:
5110 case TRUNCATE_TO_MEM_VI32: {
5111 Info.opc = ISD::INTRINSIC_VOID;
5112 Info.ptrVal = I.getArgOperand(0);
5113 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5114 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5115 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5116 ScalarVT = MVT::i8;
5117 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5118 ScalarVT = MVT::i16;
5119 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5120 ScalarVT = MVT::i32;
5121
5122 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5123 Info.align = Align(1);
5124 Info.flags |= MachineMemOperand::MOStore;
5125 break;
5126 }
5127 case GATHER:
5128 case GATHER_AVX2: {
5129 Info.opc = ISD::INTRINSIC_W_CHAIN;
5130 Info.ptrVal = nullptr;
5131 MVT DataVT = MVT::getVT(I.getType());
5132 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5133 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5134 IndexVT.getVectorNumElements());
5135 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5136 Info.align = Align(1);
5137 Info.flags |= MachineMemOperand::MOLoad;
5138 break;
5139 }
5140 case SCATTER: {
5141 Info.opc = ISD::INTRINSIC_VOID;
5142 Info.ptrVal = nullptr;
5143 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5144 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5145 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5146 IndexVT.getVectorNumElements());
5147 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5148 Info.align = Align(1);
5149 Info.flags |= MachineMemOperand::MOStore;
5150 break;
5151 }
5152 default:
5153 return false;
5154 }
5155
5156 return true;
5157}
5158
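// A minimal standalone sketch (not the LLVM code above) of how the GATHER and
// SCATTER cases size Info.memVT: the number of lanes that actually touch
// memory is the smaller of the data and index vector element counts, and the
// scalar type stays that of the data vector. The struct below is illustrative
// only.
#include <algorithm>
#include <cassert>

struct SimpleVecVT {
  unsigned NumElts;
  unsigned ScalarBits;
};

static SimpleVecVT gatherScatterMemVT(SimpleVecVT DataVT, SimpleVecVT IndexVT) {
  unsigned NumElts = std::min(DataVT.NumElts, IndexVT.NumElts);
  return {NumElts, DataVT.ScalarBits};
}

static void gatherMemVTExample() {
  // A gather producing v8i32 data from a v4i64 index vector only loads four
  // 32-bit elements, so the memory VT is the equivalent of v4i32.
  SimpleVecVT Mem = gatherScatterMemVT({8, 32}, {4, 64});
  assert(Mem.NumElts == 4 && Mem.ScalarBits == 32);
}
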
5159/// Returns true if the target can instruction select the
5160/// specified FP immediate natively. If false, the legalizer will
5161/// materialize the FP immediate as a load from a constant pool.
5162bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5163 bool ForCodeSize) const {
5164 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5165 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5166 return true;
5167 }
5168 return false;
5169}
5170
5171bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5172 ISD::LoadExtType ExtTy,
5173 EVT NewVT) const {
5174  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5175
5176 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5177  // relocations target a movq or addq instruction: don't let the load shrink.
5178 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5179 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5180 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5181 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5182
5183 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5184 // those uses are extracted directly into a store, then the extract + store
5185 // can be store-folded. Therefore, it's probably not worth splitting the load.
5186 EVT VT = Load->getValueType(0);
5187 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5188 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5189 // Skip uses of the chain value. Result 0 of the node is the load value.
5190 if (UI.getUse().getResNo() != 0)
5191 continue;
5192
5193 // If this use is not an extract + store, it's probably worth splitting.
5194 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5195 UI->use_begin()->getOpcode() != ISD::STORE)
5196 return true;
5197 }
5198 // All non-chain uses are extract + store.
5199 return false;
5200 }
5201
5202 return true;
5203}
5204
5205/// Returns true if it is beneficial to convert a load of a constant
5206/// to just the constant itself.
5207bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5208 Type *Ty) const {
5209  assert(Ty->isIntegerTy());
5210
5211 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5212 if (BitSize == 0 || BitSize > 64)
5213 return false;
5214 return true;
5215}
5216
5217bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5218 // If we are using XMM registers in the ABI and the condition of the select is
5219 // a floating-point compare and we have blendv or conditional move, then it is
5220 // cheaper to select instead of doing a cross-register move and creating a
5221 // load that depends on the compare result.
5222 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5223 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5224}
5225
5226bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5227 // TODO: It might be a win to ease or lift this restriction, but the generic
5228 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5229 if (VT.isVector() && Subtarget.hasAVX512())
5230 return false;
5231
5232 return true;
5233}
5234
5235bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5236 SDValue C) const {
5237 // TODO: We handle scalars using custom code, but generic combining could make
5238 // that unnecessary.
5239 APInt MulC;
5240 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5241 return false;
5242
5243  // Find the type this will be legalized to. Otherwise we might prematurely
5244 // convert this to shl+add/sub and then still have to type legalize those ops.
5245 // Another choice would be to defer the decision for illegal types until
5246 // after type legalization. But constant splat vectors of i64 can't make it
5247 // through type legalization on 32-bit targets so we would need to special
5248 // case vXi64.
5249 while (getTypeAction(Context, VT) != TypeLegal)
5250 VT = getTypeToTransformTo(Context, VT);
5251
5252 // If vector multiply is legal, assume that's faster than shl + add/sub.
5253 // TODO: Multiply is a complex op with higher latency and lower throughput in
5254 // most implementations, so this check could be loosened based on type
5255 // and/or a CPU attribute.
5256 if (isOperationLegal(ISD::MUL, VT))
5257 return false;
5258
5259 // shl+add, shl+sub, shl+add+neg
5260 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5261 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5262}
5263
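// A minimal standalone sketch (not the LLVM code above, which works on APInt
// splat constants) of the final check in decomposeMulByConstant: a multiply is
// worth decomposing into a shift plus one add/sub (optionally negated) when
// the constant is one away from a power of two on either side of zero.
#include <cassert>
#include <cstdint>

static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

static bool decomposableMul(int64_t MulC) {
  return isPow2(MulC + 1) || isPow2(MulC - 1) ||
         isPow2(1 - MulC) || isPow2(-(MulC + 1));
}

static void decomposableMulExamples() {
  assert(decomposableMul(5));   // x * 5  = (x << 2) + x
  assert(decomposableMul(7));   // x * 7  = (x << 3) - x
  assert(decomposableMul(-3));  // x * -3 = x - (x << 2)
  assert(!decomposableMul(11)); // needs more than one shift and add/sub
}
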
5264bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5265 unsigned Index) const {
5266 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5267 return false;
5268
5269 // Mask vectors support all subregister combinations and operations that
5270 // extract half of vector.
5271 if (ResVT.getVectorElementType() == MVT::i1)
5272 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5273 (Index == ResVT.getVectorNumElements()));
5274
5275 return (Index % ResVT.getVectorNumElements()) == 0;
5276}
5277
5278bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5279 unsigned Opc = VecOp.getOpcode();
5280
5281 // Assume target opcodes can't be scalarized.
5282 // TODO - do we have any exceptions?
5283 if (Opc >= ISD::BUILTIN_OP_END)
5284 return false;
5285
5286 // If the vector op is not supported, try to convert to scalar.
5287 EVT VecVT = VecOp.getValueType();
5288 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5289 return true;
5290
5291 // If the vector op is supported, but the scalar op is not, the transform may
5292 // not be worthwhile.
5293 EVT ScalarVT = VecVT.getScalarType();
5294 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5295}
5296
5297bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5298 bool) const {
5299 // TODO: Allow vectors?
5300 if (VT.isVector())
5301 return false;
5302 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5303}
5304
5305bool X86TargetLowering::isCheapToSpeculateCttz() const {
5306 // Speculate cttz only if we can directly use TZCNT.
5307 return Subtarget.hasBMI();
5308}
5309
5310bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5311 // Speculate ctlz only if we can directly use LZCNT.
5312 return Subtarget.hasLZCNT();
5313}
5314
5315bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5316 const SelectionDAG &DAG,
5317 const MachineMemOperand &MMO) const {
5318 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5319 BitcastVT.getVectorElementType() == MVT::i1)
5320 return false;
5321
5322 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5323 return false;
5324
5325 // If both types are legal vectors, it's always ok to convert them.
5326 if (LoadVT.isVector() && BitcastVT.isVector() &&
5327 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5328 return true;
5329
5330 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5331}
5332
5333bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5334 const SelectionDAG &DAG) const {
5335  // Do not merge to float value size (128 bits) if no implicit
5336 // float attribute is set.
5337 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5338 Attribute::NoImplicitFloat);
5339
5340 if (NoFloat) {
5341 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5342 return (MemVT.getSizeInBits() <= MaxIntSize);
5343 }
5344 // Make sure we don't merge greater than our preferred vector
5345 // width.
5346 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5347 return false;
5348
5349 // Don't merge to x86 amx tile, as we only map MVT::v256i32
5350 // to x86 amx tile on amx intrinsics.
5351 if (MemVT == MVT::v256i32)
5352 return false;
5353
5354 return true;
5355}
5356
5357bool X86TargetLowering::isCtlzFast() const {
5358 return Subtarget.hasFastLZCNT();
5359}
5360
5361bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5362 const Instruction &AndI) const {
5363 return true;
5364}
5365
5366bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5367 EVT VT = Y.getValueType();
5368
5369 if (VT.isVector())
5370 return false;
5371
5372 if (!Subtarget.hasBMI())
5373 return false;
5374
5375 // There are only 32-bit and 64-bit forms for 'andn'.
5376 if (VT != MVT::i32 && VT != MVT::i64)
5377 return false;
5378
5379 return !isa<ConstantSDNode>(Y);
5380}
5381
5382bool X86TargetLowering::hasAndNot(SDValue Y) const {
5383 EVT VT = Y.getValueType();
5384
5385 if (!VT.isVector())
5386 return hasAndNotCompare(Y);
5387
5388 // Vector.
5389
5390 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5391 return false;
5392
5393 if (VT == MVT::v4i32)
5394 return true;
5395
5396 return Subtarget.hasSSE2();
5397}
5398
5399bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5400 return X.getValueType().isScalarInteger(); // 'bt'
5401}
5402
5403bool X86TargetLowering::
5404 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5405 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5406 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5407 SelectionDAG &DAG) const {
5408 // Does baseline recommend not to perform the fold by default?
5409 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5410 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5411 return false;
5412 // For scalars this transform is always beneficial.
5413 if (X.getValueType().isScalarInteger())
5414 return true;
5415 // If all the shift amounts are identical, then transform is beneficial even
5416 // with rudimentary SSE2 shifts.
5417 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5418 return true;
5419  // If we have AVX2 with its powerful shift operations, then it's also good.
5420 if (Subtarget.hasAVX2())
5421 return true;
5422 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5423 return NewShiftOpcode == ISD::SHL;
5424}
5425
5426bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5427 const SDNode *N, CombineLevel Level) const {
5428  assert(((N->getOpcode() == ISD::SHL &&
5429           N->getOperand(0).getOpcode() == ISD::SRL) ||
5430          (N->getOpcode() == ISD::SRL &&
5431           N->getOperand(0).getOpcode() == ISD::SHL)) &&
5432         "Expected shift-shift mask");
5433 EVT VT = N->getValueType(0);
5434 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5435 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5436 // Only fold if the shift values are equal - so it folds to AND.
5437 // TODO - we should fold if either is a non-uniform vector but we don't do
5438 // the fold for non-splats yet.
5439 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5440 }
5441 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5442}
5443
5444bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5445 EVT VT = Y.getValueType();
5446
5447 // For vectors, we don't have a preference, but we probably want a mask.
5448 if (VT.isVector())
5449 return false;
5450
5451 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5452 if (VT == MVT::i64 && !Subtarget.is64Bit())
5453 return false;
5454
5455 return true;
5456}
5457
5458bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5459 SDNode *N) const {
5460 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5461 !Subtarget.isOSWindows())
5462 return false;
5463 return true;
5464}
5465
5466bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5467 // Any legal vector type can be splatted more efficiently than
5468 // loading/spilling from memory.
5469 return isTypeLegal(VT);
5470}
5471
5472MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5473 MVT VT = MVT::getIntegerVT(NumBits);
5474 if (isTypeLegal(VT))
5475 return VT;
5476
5477 // PMOVMSKB can handle this.
5478 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5479 return MVT::v16i8;
5480
5481 // VPMOVMSKB can handle this.
5482 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5483 return MVT::v32i8;
5484
5485 // TODO: Allow 64-bit type for 32-bit target.
5486 // TODO: 512-bit types should be allowed, but make sure that those
5487 // cases are handled in combineVectorSizedSetCCEquality().
5488
5489 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5490}
5491
5492/// Val is the undef sentinel value or equal to the specified value.
5493static bool isUndefOrEqual(int Val, int CmpVal) {
5494 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5495}
5496
5497/// Return true if every element in Mask is the undef sentinel value or equal to
5498/// the specified value..
5499static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5500 return llvm::all_of(Mask, [CmpVal](int M) {
5501 return (M == SM_SentinelUndef) || (M == CmpVal);
5502 });
5503}
5504
5505/// Val is either the undef or zero sentinel value.
5506static bool isUndefOrZero(int Val) {
5507 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5508}
5509
5510/// Return true if every element in Mask, beginning from position Pos and ending
5511/// in Pos+Size is the undef sentinel value.
5512static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5513 return llvm::all_of(Mask.slice(Pos, Size),
5514 [](int M) { return M == SM_SentinelUndef; });
5515}
5516
5517/// Return true if the mask creates a vector whose lower half is undefined.
5518static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5519 unsigned NumElts = Mask.size();
5520 return isUndefInRange(Mask, 0, NumElts / 2);
5521}
5522
5523/// Return true if the mask creates a vector whose upper half is undefined.
5524static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5525 unsigned NumElts = Mask.size();
5526 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5527}
5528
5529/// Return true if Val falls within the specified range [Low, Hi).
5530static bool isInRange(int Val, int Low, int Hi) {
5531 return (Val >= Low && Val < Hi);
5532}
5533
5534/// Return true if the value of any element in Mask falls within the specified
5535/// range [Low, Hi).
5536static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5537 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5538}
5539
5540/// Return true if the value of any element in Mask is the zero sentinel value.
5541static bool isAnyZero(ArrayRef<int> Mask) {
5542 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5543}
5544
5545/// Return true if the value of any element in Mask is the zero or undef
5546/// sentinel values.
5547static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5548 return llvm::any_of(Mask, [](int M) {
5549 return M == SM_SentinelZero || M == SM_SentinelUndef;
5550 });
5551}
5552
5553/// Return true if Val is undef or if its value falls within the
5554/// specified range [Low, Hi).
5555static bool isUndefOrInRange(int Val, int Low, int Hi) {
5556 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5557}
5558
5559/// Return true if every element in Mask is undef or if its value
5560/// falls within the specified range [Low, Hi).
5561static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5562 return llvm::all_of(
5563 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5564}
5565
5566/// Return true if Val is undef, zero or if its value falls within the
5567/// specified range [Low, Hi).
5568static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5569 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5570}
5571
5572/// Return true if every element in Mask is undef, zero or if its value
5573/// falls within the specified range [Low, Hi).
5574static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5575 return llvm::all_of(
5576 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5577}
5578
5579/// Return true if every element in Mask, beginning
5580/// from position Pos and ending in Pos + Size, falls within the specified
5581/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5582static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5583 unsigned Size, int Low, int Step = 1) {
5584 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5585 if (!isUndefOrEqual(Mask[i], Low))
5586 return false;
5587 return true;
5588}
5589
5590/// Return true if every element in Mask, beginning
5591/// from position Pos and ending in Pos+Size, falls within the specified
5592/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
5593static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5594 unsigned Size, int Low,
5595 int Step = 1) {
5596 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5597 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5598 return false;
5599 return true;
5600}
5601
5602/// Return true if every element in Mask, beginning
5603/// from position Pos and ending in Pos+Size is undef or is zero.
5604static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5605 unsigned Size) {
5606 return llvm::all_of(Mask.slice(Pos, Size),
5607 [](int M) { return isUndefOrZero(M); });
5608}
5609
5610/// Helper function to test whether a shuffle mask could be
5611/// simplified by widening the elements being shuffled.
5612///
5613/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5614/// leaves it in an unspecified state.
5615///
5616/// NOTE: This must handle normal vector shuffle masks and *target* vector
5617/// shuffle masks. The latter have the special property of a '-2' representing
5618/// a zero-ed lane of a vector.
5619static bool canWidenShuffleElements(ArrayRef<int> Mask,
5620 SmallVectorImpl<int> &WidenedMask) {
5621 WidenedMask.assign(Mask.size() / 2, 0);
5622 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5623 int M0 = Mask[i];
5624 int M1 = Mask[i + 1];
5625
5626    // If both elements are undef, it's trivial.
5627 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5628 WidenedMask[i / 2] = SM_SentinelUndef;
5629 continue;
5630 }
5631
5632 // Check for an undef mask and a mask value properly aligned to fit with
5633 // a pair of values. If we find such a case, use the non-undef mask's value.
5634 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5635 WidenedMask[i / 2] = M1 / 2;
5636 continue;
5637 }
5638 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5639 WidenedMask[i / 2] = M0 / 2;
5640 continue;
5641 }
5642
5643 // When zeroing, we need to spread the zeroing across both lanes to widen.
5644 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5645 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5646 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5647 WidenedMask[i / 2] = SM_SentinelZero;
5648 continue;
5649 }
5650 return false;
5651 }
5652
5653 // Finally check if the two mask values are adjacent and aligned with
5654 // a pair.
5655 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5656 WidenedMask[i / 2] = M0 / 2;
5657 continue;
5658 }
5659
5660 // Otherwise we can't safely widen the elements used in this shuffle.
5661 return false;
5662 }
5663  assert(WidenedMask.size() == Mask.size() / 2 &&
5664         "Incorrect size of mask after widening the elements!");
5665
5666 return true;
5667}
5668
5669static bool canWidenShuffleElements(ArrayRef<int> Mask,
5670 const APInt &Zeroable,
5671 bool V2IsZero,
5672 SmallVectorImpl<int> &WidenedMask) {
5673 // Create an alternative mask with info about zeroable elements.
5674 // Here we do not set undef elements as zeroable.
5675 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5676 if (V2IsZero) {
5677    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5678 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5679 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5680 ZeroableMask[i] = SM_SentinelZero;
5681 }
5682 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5683}
5684
5685static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5686 SmallVector<int, 32> WidenedMask;
5687 return canWidenShuffleElements(Mask, WidenedMask);
5688}
5689
5690// Attempt to narrow/widen shuffle mask until it matches the target number of
5691// elements.
5692static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5693 SmallVectorImpl<int> &ScaledMask) {
5694 unsigned NumSrcElts = Mask.size();
5695  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5696         "Illegal shuffle scale factor");
5697
5698 // Narrowing is guaranteed to work.
5699 if (NumDstElts >= NumSrcElts) {
5700 int Scale = NumDstElts / NumSrcElts;
5701 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5702 return true;
5703 }
5704
5705 // We have to repeat the widening until we reach the target size, but we can
5706 // split out the first widening as it sets up ScaledMask for us.
5707 if (canWidenShuffleElements(Mask, ScaledMask)) {
5708 while (ScaledMask.size() > NumDstElts) {
5709 SmallVector<int, 16> WidenedMask;
5710 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5711 return false;
5712 ScaledMask = std::move(WidenedMask);
5713 }
5714 return true;
5715 }
5716
5717 return false;
5718}
5719
5720/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5721bool X86::isZeroNode(SDValue Elt) {
5722 return isNullConstant(Elt) || isNullFPConstant(Elt);
5723}
5724
5725// Build a vector of constants.
5726// Use an UNDEF node if MaskElt == -1.
5727// Split 64-bit constants in 32-bit mode.
5728static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5729 const SDLoc &dl, bool IsMask = false) {
5730
5731 SmallVector<SDValue, 32> Ops;
5732 bool Split = false;
5733
5734 MVT ConstVecVT = VT;
5735 unsigned NumElts = VT.getVectorNumElements();
5736 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5737 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5738 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5739 Split = true;
5740 }
5741
5742 MVT EltVT = ConstVecVT.getVectorElementType();
5743 for (unsigned i = 0; i < NumElts; ++i) {
5744 bool IsUndef = Values[i] < 0 && IsMask;
5745 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5746 DAG.getConstant(Values[i], dl, EltVT);
5747 Ops.push_back(OpNode);
5748 if (Split)
5749 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5750 DAG.getConstant(0, dl, EltVT));
5751 }
5752 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5753 if (Split)
5754 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5755 return ConstsNode;
5756}
5757
5758static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5759 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5760  assert(Bits.size() == Undefs.getBitWidth() &&
5761         "Unequal constant and undef arrays");
5762 SmallVector<SDValue, 32> Ops;
5763 bool Split = false;
5764
5765 MVT ConstVecVT = VT;
5766 unsigned NumElts = VT.getVectorNumElements();
5767 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5768 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5769 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5770 Split = true;
5771 }
5772
5773 MVT EltVT = ConstVecVT.getVectorElementType();
5774 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5775 if (Undefs[i]) {
5776 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5777 continue;
5778 }
5779 const APInt &V = Bits[i];
5780    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5781 if (Split) {
5782 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5783 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5784 } else if (EltVT == MVT::f32) {
5785 APFloat FV(APFloat::IEEEsingle(), V);
5786 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5787 } else if (EltVT == MVT::f64) {
5788 APFloat FV(APFloat::IEEEdouble(), V);
5789 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5790 } else {
5791 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5792 }
5793 }
5794
5795 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5796 return DAG.getBitcast(VT, ConstsNode);
5797}
5798
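// A minimal standalone sketch (not the LLVM code above, which builds SDNodes)
// of the 64-bit splitting performed by the APInt-based getConstVector when i64
// is not a legal type: each 64-bit lane is emitted as two 32-bit lanes, low
// half first, and the wider vector is later bitcast back to the requested
// type. Names below are illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> splitTo32BitLanes(const std::vector<uint64_t> &Lanes64) {
  std::vector<uint32_t> Lanes32;
  for (uint64_t V : Lanes64) {
    Lanes32.push_back(static_cast<uint32_t>(V));       // low 32 bits
    Lanes32.push_back(static_cast<uint32_t>(V >> 32)); // high 32 bits
  }
  return Lanes32;
}

static void splitTo32Example() {
  // A v2i64 constant <0x1122334455667788, 1> becomes the v4i32 constant
  // <0x55667788, 0x11223344, 1, 0> before the bitcast back to v2i64.
  std::vector<uint32_t> L = splitTo32BitLanes({0x1122334455667788ULL, 1});
  assert((L == std::vector<uint32_t>{0x55667788u, 0x11223344u, 1u, 0u}));
}
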
5799/// Returns a vector of specified type with all zero elements.
5800static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5801 SelectionDAG &DAG, const SDLoc &dl) {
5802  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5803          VT.getVectorElementType() == MVT::i1) &&
5804         "Unexpected vector type");
5805
5806 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5807 // type. This ensures they get CSE'd. But if the integer type is not
5808 // available, use a floating-point +0.0 instead.
5809 SDValue Vec;
5810 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5811 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5812 } else if (VT.isFloatingPoint()) {
5813 Vec = DAG.getConstantFP(+0.0, dl, VT);
5814 } else if (VT.getVectorElementType() == MVT::i1) {
5815    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5816           "Unexpected vector type");
5817 Vec = DAG.getConstant(0, dl, VT);
5818 } else {
5819 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5820 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5821 }
5822 return DAG.getBitcast(VT, Vec);
5823}
5824
5825static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5826 const SDLoc &dl, unsigned vectorWidth) {
5827 EVT VT = Vec.getValueType();
5828 EVT ElVT = VT.getVectorElementType();
5829 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5830 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5831 VT.getVectorNumElements()/Factor);
5832
5833 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5834 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5835  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5836
5837 // This is the index of the first element of the vectorWidth-bit chunk
5838  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5839 IdxVal &= ~(ElemsPerChunk - 1);
5840
5841 // If the input is a buildvector just emit a smaller one.
5842 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5843 return DAG.getBuildVector(ResultVT, dl,
5844 Vec->ops().slice(IdxVal, ElemsPerChunk));
5845
5846 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5847 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5848}
5849
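// A minimal standalone sketch (not the LLVM code above) of the index rounding
// used by extractSubVector (the insert helper below uses the same rounding):
// the element index is clamped down to the start of the 128- or 256-bit chunk
// containing it, which is a simple mask because the elements-per-chunk count
// is a power of two. Names below are illustrative only.
#include <cassert>

static unsigned chunkStartIndex(unsigned IdxVal, unsigned ChunkBits,
                                unsigned EltBits) {
  unsigned ElemsPerChunk = ChunkBits / EltBits; // power of two by construction
  return IdxVal & ~(ElemsPerChunk - 1);
}

static void chunkStartExamples() {
  // Extracting 128 bits of a v8i32 starting "at" element 5 really extracts the
  // chunk beginning at element 4 (elements 4..7).
  assert(chunkStartIndex(5, 128, 32) == 4);
  // For v16i16 and a 128-bit chunk, element 11 rounds down to element 8.
  assert(chunkStartIndex(11, 128, 16) == 8);
}
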
5850/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5851/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5852/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5853/// instructions or a simple subregister reference. Idx is an index in the
5854/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5855/// lowering EXTRACT_VECTOR_ELT operations easier.
5856static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5857 SelectionDAG &DAG, const SDLoc &dl) {
5858  assert((Vec.getValueType().is256BitVector() ||
5859          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5860 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5861}
5862
5863/// Generate a DAG to grab 256-bits from a 512-bit vector.
5864static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5865 SelectionDAG &DAG, const SDLoc &dl) {
5866  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5867 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5868}
5869
5870static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5871 SelectionDAG &DAG, const SDLoc &dl,
5872 unsigned vectorWidth) {
5873  assert((vectorWidth == 128 || vectorWidth == 256) &&
5874         "Unsupported vector width");
5875  // Inserting UNDEF just returns Result.
5876 if (Vec.isUndef())
5877 return Result;
5878 EVT VT = Vec.getValueType();
5879 EVT ElVT = VT.getVectorElementType();
5880 EVT ResultVT = Result.getValueType();
5881
5882 // Insert the relevant vectorWidth bits.
5883 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5884  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5885
5886 // This is the index of the first element of the vectorWidth-bit chunk
5887  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5888 IdxVal &= ~(ElemsPerChunk - 1);
5889
5890 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5891 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5892}
5893
5894/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5895/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5896/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5897/// simple superregister reference. Idx is an index in the 128 bits
5898/// we want. It need not be aligned to a 128-bit boundary. That makes
5899/// lowering INSERT_VECTOR_ELT operations easier.
5900static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5901 SelectionDAG &DAG, const SDLoc &dl) {
5902  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5903 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5904}
5905
5906/// Widen a vector to a larger size with the same scalar type, with the new
5907/// elements either zero or undef.
5908static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5909 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5910 const SDLoc &dl) {
5911  assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
5912         Vec.getValueType().getScalarType() == VT.getScalarType() &&
5913         "Unsupported vector widening type");
5914 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5915 : DAG.getUNDEF(VT);
5916 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5917 DAG.getIntPtrConstant(0, dl));
5918}
5919
5920/// Widen a vector to a larger size with the same scalar type, with the new
5921/// elements either zero or undef.
5922static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5923 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5924 const SDLoc &dl, unsigned WideSizeInBits) {
5925 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5926 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5927 "Unsupported vector widening type");
5928 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5929 MVT SVT = Vec.getSimpleValueType().getScalarType();
5930 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5931 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5932}
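// Illustrative example: widenSubVector(Vec = v4i32, ZeroNewElements = true,
// ..., WideSizeInBits = 256) computes WideNumElts = 256 / 32 = 8 and returns a
// v8i32 whose low four elements are Vec and whose upper four elements are
// zero (undef instead if ZeroNewElements were false).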
5933
5934// Helper function to collect subvector ops that are concatenated together,
5935// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5936// The subvectors in Ops are guaranteed to be the same type.
5937static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5938 assert(Ops.empty() && "Expected an empty ops vector");
5939
5940 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5941 Ops.append(N->op_begin(), N->op_end());
5942 return true;
5943 }
5944
5945 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
5946 SDValue Src = N->getOperand(0);
5947 SDValue Sub = N->getOperand(1);
5948 const APInt &Idx = N->getConstantOperandAPInt(2);
5949 EVT VT = Src.getValueType();
5950 EVT SubVT = Sub.getValueType();
5951
5952 // TODO - Handle more general insert_subvector chains.
5953 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
5954 Idx == (VT.getVectorNumElements() / 2)) {
5955 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
5956 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5957 Src.getOperand(1).getValueType() == SubVT &&
5958 isNullConstant(Src.getOperand(2))) {
5959 Ops.push_back(Src.getOperand(1));
5960 Ops.push_back(Sub);
5961 return true;
5962 }
5963 // insert_subvector(x, extract_subvector(x, lo), hi)
5964 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5965 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
5966 Ops.append(2, Sub);
5967 return true;
5968 }
5969 }
5970 }
5971
5972 return false;
5973}
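// Illustrative example: for a v8i32 node built as
//   insert_subvector(insert_subvector(undef, X:v4i32, 0), Y:v4i32, 4)
// collectConcatOps returns Ops = { X, Y }; for
//   insert_subvector(X, extract_subvector(X, 0), 4)
// it returns the low-half extract of X twice.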
5974
5975static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
5976 const SDLoc &dl) {
5977 EVT VT = Op.getValueType();
5978 unsigned NumElems = VT.getVectorNumElements();
5979 unsigned SizeInBits = VT.getSizeInBits();
5980 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
5981 "Can't split odd sized vector");
5982
5983 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
5984 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
5985 return std::make_pair(Lo, Hi);
5986}
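// Illustrative example: splitVector on a v8i32 value returns the pair
// (elements 0-3, elements 4-7), each extracted as a 128-bit v4i32.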
5987
5988// Split a unary integer op into 2 half-sized ops.
5989static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
5990 EVT VT = Op.getValueType();
5991
5992 // Make sure we only try to split 256/512-bit types to avoid creating
5993 // narrow vectors.
5994 assert((Op.getOperand(0).getValueType().is256BitVector() ||
5995 Op.getOperand(0).getValueType().is512BitVector()) &&
5996 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
5997 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
5998 VT.getVectorNumElements() &&
5999 "Unexpected VTs!");
6000
6001 SDLoc dl(Op);
6002
6003 // Extract the Lo/Hi vectors
6004 SDValue Lo, Hi;
6005 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6006
6007 EVT LoVT, HiVT;
6008 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6009 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6010 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6011 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6012}
6013
6014/// Break a binary integer operation into 2 half-sized ops and then
6015/// concatenate the results back together.
6016static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6017 EVT VT = Op.getValueType();
6018
6019 // Sanity check that all the types match.
6020 assert(Op.getOperand(0).getValueType() == VT &&
6021 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6022 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6023
6024 SDLoc dl(Op);
6025
6026 // Extract the LHS Lo/Hi vectors
6027 SDValue LHS1, LHS2;
6028 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6029
6030 // Extract the RHS Lo/Hi vectors
6031 SDValue RHS1, RHS2;
6032 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6033
6034 EVT LoVT, HiVT;
6035 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6036 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6037 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6038 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6039}
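// Illustrative example: a 256-bit v32i8 ISD::ADD becomes
//   concat_vectors(add(lo(LHS), lo(RHS)), add(hi(LHS), hi(RHS)))
// where each half is a v16i8, so the operation executes as two 128-bit adds.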
6040
6041// Helper for splitting operands of an operation to legal target size and
6042// apply a function on each part.
6043// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6044// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6045// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6046// The argument Builder is a function that will be applied on each split part:
6047// SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
6048template <typename F>
6049SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6050 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6051 F Builder, bool CheckBWI = true) {
6052 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6053 unsigned NumSubs = 1;
6054 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6055 (!CheckBWI && Subtarget.useAVX512Regs())) {
6056 if (VT.getSizeInBits() > 512) {
6057 NumSubs = VT.getSizeInBits() / 512;
6058 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6059 }
6060 } else if (Subtarget.hasAVX2()) {
6061 if (VT.getSizeInBits() > 256) {
6062 NumSubs = VT.getSizeInBits() / 256;
6063 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6064 }
6065 } else {
6066 if (VT.getSizeInBits() > 128) {
6067 NumSubs = VT.getSizeInBits() / 128;
6068 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6069 }
6070 }
6071
6072 if (NumSubs == 1)
6073 return Builder(DAG, DL, Ops);
6074
6075 SmallVector<SDValue, 4> Subs;
6076 for (unsigned i = 0; i != NumSubs; ++i) {
6077 SmallVector<SDValue, 2> SubOps;
6078 for (SDValue Op : Ops) {
6079 EVT OpVT = Op.getValueType();
6080 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6081 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6082 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6083 }
6084 Subs.push_back(Builder(DAG, DL, SubOps));
6085 }
6086 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6087}
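// Illustrative trace: for VT = v32i16 on a target with AVX2 but without
// AVX-512BW, CheckBWI is true but useBWIRegs() is false, so the AVX2 branch
// sets NumSubs = 512 / 256 = 2; each 512-bit operand is split into two 256-bit
// halves, Builder runs once per half, and the results are concatenated back
// into a v32i16.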
6088
6089/// Insert an i1 subvector into an i1 vector.
6090static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6091 const X86Subtarget &Subtarget) {
6092
6093 SDLoc dl(Op);
6094 SDValue Vec = Op.getOperand(0);
6095 SDValue SubVec = Op.getOperand(1);
6096 SDValue Idx = Op.getOperand(2);
6097 unsigned IdxVal = Op.getConstantOperandVal(2);
6098
6099 // Inserting undef is a nop. We can just return the original vector.
6100 if (SubVec.isUndef())
6101 return Vec;
6102
6103 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6104 return Op;
6105
6106 MVT OpVT = Op.getSimpleValueType();
6107 unsigned NumElems = OpVT.getVectorNumElements();
6108 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6109
6110 // Extend to natively supported kshift.
6111 MVT WideOpVT = OpVT;
6112 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6113 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6114
6115 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6116 // if necessary.
6117 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6118 // May need to promote to a legal type.
6119 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6120 DAG.getConstant(0, dl, WideOpVT),
6121 SubVec, Idx);
6122 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6123 }
6124
6125 MVT SubVecVT = SubVec.getSimpleValueType();
6126 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6127 assert(IdxVal + SubVecNumElems <= NumElems &&
6128 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6129 "Unexpected index value in INSERT_SUBVECTOR");
6130
6131 SDValue Undef = DAG.getUNDEF(WideOpVT);
6132
6133 if (IdxVal == 0) {
6134 // Zero the lower bits of Vec.
6135 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6136 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6137 ZeroIdx);
6138 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6139 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6140 // Merge them together, SubVec should be zero extended.
6141 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6142 DAG.getConstant(0, dl, WideOpVT),
6143 SubVec, ZeroIdx);
6144 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6145 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6146 }
6147
6148 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6149 Undef, SubVec, ZeroIdx);
6150
6151 if (Vec.isUndef()) {
6152 assert(IdxVal != 0 && "Unexpected index");
6153 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6154 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6155 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6156 }
6157
6158 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6159 assert(IdxVal != 0 && "Unexpected index");
6160 NumElems = WideOpVT.getVectorNumElements();
6161 unsigned ShiftLeft = NumElems - SubVecNumElems;
6162 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6163 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6164 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6165 if (ShiftRight != 0)
6166 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6167 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6168 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6169 }
6170
6171 // Simple case when we put the subvector in the upper part.
6172 if (IdxVal + SubVecNumElems == NumElems) {
6173 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6174 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6175 if (SubVecNumElems * 2 == NumElems) {
6176 // Special case: use a legal zero-extending insert_subvector. This allows
6177 // isel to optimize when bits are known zero.
6178 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6179 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6180 DAG.getConstant(0, dl, WideOpVT),
6181 Vec, ZeroIdx);
6182 } else {
6183 // Otherwise use explicit shifts to zero the bits.
6184 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6185 Undef, Vec, ZeroIdx);
6186 NumElems = WideOpVT.getVectorNumElements();
6187 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6188 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6189 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6190 }
6191 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6192 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6193 }
6194
6195 // Inserting into the middle is more complicated.
6196
6197 NumElems = WideOpVT.getVectorNumElements();
6198
6199 // Widen the vector if needed.
6200 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6201
6202 unsigned ShiftLeft = NumElems - SubVecNumElems;
6203 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6204
6205 // Do an optimization for the most frequently used types.
6206 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6207 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6208 Mask0.flipAllBits();
6209 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6210 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6211 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6212 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6213 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6214 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6215 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6216 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6217
6218 // Reduce to original width if needed.
6219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6220 }
6221
6222 // Clear the upper bits of the subvector and move it to its insert position.
6223 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6224 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6225 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6226 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6227
6228 // Isolate the bits below the insertion point.
6229 unsigned LowShift = NumElems - IdxVal;
6230 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6231 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6232 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6233 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6234
6235 // Isolate the bits after the last inserted bit.
6236 unsigned HighShift = IdxVal + SubVecNumElems;
6237 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6238 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6239 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6240 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6241
6242 // Now OR all 3 pieces together.
6243 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6244 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6245
6246 // Reduce to original width if needed.
6247 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6248}
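// Illustrative example of the masked middle-insertion path above: inserting a
// v2i1 into a v16i1 at IdxVal = 4 builds Mask0 = ~(bits [4,6)) to clear the
// two destination bits of Vec, then shifts SubVec left by ShiftLeft = 14 and
// right by ShiftRight = 10 so its two bits land at positions 4-5 before the
// final OR.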
6249
6250static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6251 const SDLoc &dl) {
6252 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6253 EVT SubVT = V1.getValueType();
6254 EVT SubSVT = SubVT.getScalarType();
6255 unsigned SubNumElts = SubVT.getVectorNumElements();
6256 unsigned SubVectorWidth = SubVT.getSizeInBits();
6257 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6258 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6259 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6260}
6261
6262/// Returns a vector of specified type with all bits set.
6263/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6264/// Then bitcast to their original type, ensuring they get CSE'd.
6265static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6266 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6267 "Expected a 128/256/512-bit vector type");
6268
6269 APInt Ones = APInt::getAllOnesValue(32);
6270 unsigned NumElts = VT.getSizeInBits() / 32;
6271 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6272 return DAG.getBitcast(VT, Vec);
6273}
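// Illustrative example: getOnesVector(MVT::v4f64, ...) builds the constant as
// an <8 x i32> with every element 0xFFFFFFFF (256 / 32 = 8 elements) and
// bitcasts it back to v4f64, so all-ones vectors of any width share one
// canonical i32 form and CSE together.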
6274
6275// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6276static unsigned getOpcode_EXTEND(unsigned Opcode) {
6277 switch (Opcode) {
6278 case ISD::ANY_EXTEND:
6279 case ISD::ANY_EXTEND_VECTOR_INREG:
6280 return ISD::ANY_EXTEND;
6281 case ISD::ZERO_EXTEND:
6282 case ISD::ZERO_EXTEND_VECTOR_INREG:
6283 return ISD::ZERO_EXTEND;
6284 case ISD::SIGN_EXTEND:
6285 case ISD::SIGN_EXTEND_VECTOR_INREG:
6286 return ISD::SIGN_EXTEND;
6287 }
6288 llvm_unreachable("Unknown opcode");
6289}
6290
6291// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6292static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6293 switch (Opcode) {
6294 case ISD::ANY_EXTEND:
6295 case ISD::ANY_EXTEND_VECTOR_INREG:
6296 return ISD::ANY_EXTEND_VECTOR_INREG;
6297 case ISD::ZERO_EXTEND:
6298 case ISD::ZERO_EXTEND_VECTOR_INREG:
6299 return ISD::ZERO_EXTEND_VECTOR_INREG;
6300 case ISD::SIGN_EXTEND:
6301 case ISD::SIGN_EXTEND_VECTOR_INREG:
6302 return ISD::SIGN_EXTEND_VECTOR_INREG;
6303 }
6304 llvm_unreachable("Unknown opcode");
6305}
6306
6307static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6308 SDValue In, SelectionDAG &DAG) {
6309 EVT InVT = In.getValueType();
6310 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6311 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6312 ISD::ZERO_EXTEND == Opcode) &&
6313 "Unknown extension opcode");
6314
6315 // For 256-bit vectors, we only need the lower (128-bit) input half.
6316 // For 512-bit vectors, we only need the lower input half or quarter.
6317 if (InVT.getSizeInBits() > 128) {
6318 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6319 "Expected VTs to be the same size!");
6320 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6321 In = extractSubVector(In, 0, DAG, DL,
6322 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6323 InVT = In.getValueType();
6324 }
6325
6326 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6327 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6328
6329 return DAG.getNode(Opcode, DL, VT, In);
6330}
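// Illustrative example: getEXTEND_VECTOR_INREG(ISD::ZERO_EXTEND, DL,
// MVT::v8i16, In = v16i8, DAG) sees equal 128-bit widths but differing element
// counts, so the opcode becomes ZERO_EXTEND_VECTOR_INREG and only the low 8
// bytes of In are extended; a 256/512-bit input is first reduced to the low
// subvector it actually uses.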
6331
6332// Match (xor X, -1) -> X.
6333// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6334// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6335static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
6336 V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
6337 if (V.getOpcode() == ISD::XOR &&
6338 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6339 return V.getOperand(0);
6340 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6341 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6342 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6343 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6344 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6345 Not, V.getOperand(1));
6346 }
6347 }
6348 SmallVector<SDValue, 2> CatOps;
6349 if (collectConcatOps(V.getNode(), CatOps)) {
6350 for (SDValue &CatOp : CatOps) {
6351 SDValue NotCat = IsNOT(CatOp, DAG);
6352 if (!NotCat) return SDValue();
6353 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6354 }
6355 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6356 }
6357 return SDValue();
6358}
6359
6360void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6361 bool Lo, bool Unary) {
6362 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6363 int NumElts = VT.getVectorNumElements();
6364 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6365 for (int i = 0; i < NumElts; ++i) {
6366 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6367 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6368 Pos += (Unary ? 0 : NumElts * (i % 2));
6369 Pos += (Lo ? 0 : NumEltsInLane / 2);
6370 Mask.push_back(Pos);
6371 }
6372}
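// Illustrative example: for VT = v8i16, Lo = true and Unary = false this
// produces <0, 8, 1, 9, 2, 10, 3, 11> (the PUNPCKLWD interleave of the two
// inputs' low halves); Lo = false gives <4, 12, 5, 13, 6, 14, 7, 15>
// (PUNPCKHWD).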
6373
6374/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6375/// imposed by AVX and specific to the unary pattern. Example:
6376/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6377/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6378void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6379 bool Lo) {
6380 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6381 int NumElts = VT.getVectorNumElements();
6382 for (int i = 0; i < NumElts; ++i) {
6383 int Pos = i / 2;
6384 Pos += (Lo ? 0 : NumElts / 2);
6385 Mask.push_back(Pos);
6386 }
6387}
6388
6389/// Returns a vector_shuffle node for an unpackl operation.
6390static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6391 SDValue V1, SDValue V2) {
6392 SmallVector<int, 8> Mask;
6393 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6394 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6395}
6396
6397/// Returns a vector_shuffle node for an unpackh operation.
6398static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6399 SDValue V1, SDValue V2) {
6400 SmallVector<int, 8> Mask;
6401 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6402 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6403}
6404
6405/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6406/// This produces a shuffle where the low element of V2 is swizzled into the
6407/// zero/undef vector, landing at element Idx.
6408/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6409static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6410 bool IsZero,
6411 const X86Subtarget &Subtarget,
6412 SelectionDAG &DAG) {
6413 MVT VT = V2.getSimpleValueType();
6414 SDValue V1 = IsZero
6415 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6416 int NumElems = VT.getVectorNumElements();
6417 SmallVector<int, 16> MaskVec(NumElems);
6418 for (int i = 0; i != NumElems; ++i)
6419 // If this is the insertion idx, put the low elt of V2 here.
6420 MaskVec[i] = (i == Idx) ? NumElems : i;
6421 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6422}
6423
6424static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6425 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6426 Ptr.getOpcode() == X86ISD::WrapperRIP)
6427 Ptr = Ptr.getOperand(0);
6428
6429 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6430 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6431 return nullptr;
6432
6433 return CNode->getConstVal();
6434}
6435
6436static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6437 if (!Load || !ISD::isNormalLoad(Load))
6438 return nullptr;
6439 return getTargetConstantFromBasePtr(Load->getBasePtr());
6440}
6441
6442static const Constant *getTargetConstantFromNode(SDValue Op) {
6443 Op = peekThroughBitcasts(Op);
6444 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6445}
6446
6447const Constant *
6448X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6449 assert(LD && "Unexpected null LoadSDNode");
6450 return getTargetConstantFromNode(LD);
6451}
6452
6453// Extract raw constant bits from constant pools.
6454static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6455 APInt &UndefElts,
6456 SmallVectorImpl<APInt> &EltBits,
6457 bool AllowWholeUndefs = true,
6458 bool AllowPartialUndefs = true) {
6459 assert(EltBits.empty() && "Expected an empty EltBits vector");
6460
6461 Op = peekThroughBitcasts(Op);
6462
6463 EVT VT = Op.getValueType();
6464 unsigned SizeInBits = VT.getSizeInBits();
6465 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6466 unsigned NumElts = SizeInBits / EltSizeInBits;
6467
6468 // Bitcast a source array of element bits to the target size.
6469 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6470 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6471 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6472 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6473 "Constant bit sizes don't match");
6474
6475 // Don't split if we don't allow undef bits.
6476 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6477 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6478 return false;
6479
6480 // If we're already the right size, don't bother bitcasting.
6481 if (NumSrcElts == NumElts) {
6482 UndefElts = UndefSrcElts;
6483 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6484 return true;
6485 }
6486
6487 // Extract all the undef/constant element data and pack into single bitsets.
6488 APInt UndefBits(SizeInBits, 0);
6489 APInt MaskBits(SizeInBits, 0);
6490
6491 for (unsigned i = 0; i != NumSrcElts; ++i) {
6492 unsigned BitOffset = i * SrcEltSizeInBits;
6493 if (UndefSrcElts[i])
6494 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6495 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6496 }
6497
6498 // Split the undef/constant single bitset data into the target elements.
6499 UndefElts = APInt(NumElts, 0);
6500 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6501
6502 for (unsigned i = 0; i != NumElts; ++i) {
6503 unsigned BitOffset = i * EltSizeInBits;
6504 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6505
6506 // Only treat an element as UNDEF if all bits are UNDEF.
6507 if (UndefEltBits.isAllOnesValue()) {
6508 if (!AllowWholeUndefs)
6509 return false;
6510 UndefElts.setBit(i);
6511 continue;
6512 }
6513
6514 // If only some bits are UNDEF then treat them as zero (or bail if not
6515 // supported).
6516 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6517 return false;
6518
6519 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6520 }
6521 return true;
6522 };
6523
6524 // Collect constant bits and insert into mask/undef bit masks.
6525 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6526 unsigned UndefBitIndex) {
6527 if (!Cst)
6528 return false;
6529 if (isa<UndefValue>(Cst)) {
6530 Undefs.setBit(UndefBitIndex);
6531 return true;
6532 }
6533 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6534 Mask = CInt->getValue();
6535 return true;
6536 }
6537 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6538 Mask = CFP->getValueAPF().bitcastToAPInt();
6539 return true;
6540 }
6541 return false;
6542 };
6543
6544 // Handle UNDEFs.
6545 if (Op.isUndef()) {
6546 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6547 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6548 return CastBitData(UndefSrcElts, SrcEltBits);
6549 }
6550
6551 // Extract scalar constant bits.
6552 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6553 APInt UndefSrcElts = APInt::getNullValue(1);
6554 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6555 return CastBitData(UndefSrcElts, SrcEltBits);
6556 }
6557 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6558 APInt UndefSrcElts = APInt::getNullValue(1);
6559 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6560 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6561 return CastBitData(UndefSrcElts, SrcEltBits);
6562 }
6563
6564 // Extract constant bits from build vector.
6565 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6566 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6567 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6568
6569 APInt UndefSrcElts(NumSrcElts, 0);
6570 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6571 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6572 const SDValue &Src = Op.getOperand(i);
6573 if (Src.isUndef()) {
6574 UndefSrcElts.setBit(i);
6575 continue;
6576 }
6577 auto *Cst = cast<ConstantSDNode>(Src);
6578 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6579 }
6580 return CastBitData(UndefSrcElts, SrcEltBits);
6581 }
6582 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6583 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6584 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6585
6586 APInt UndefSrcElts(NumSrcElts, 0);
6587 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6588 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6589 const SDValue &Src = Op.getOperand(i);
6590 if (Src.isUndef()) {
6591 UndefSrcElts.setBit(i);
6592 continue;
6593 }
6594 auto *Cst = cast<ConstantFPSDNode>(Src);
6595 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6596 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6597 }
6598 return CastBitData(UndefSrcElts, SrcEltBits);
6599 }
6600
6601 // Extract constant bits from constant pool vector.
6602 if (auto *Cst = getTargetConstantFromNode(Op)) {
6603 Type *CstTy = Cst->getType();
6604 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6605 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6606 return false;
6607
6608 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6609 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6610
6611 APInt UndefSrcElts(NumSrcElts, 0);
6612 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6613 for (unsigned i = 0; i != NumSrcElts; ++i)
6614 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6615 UndefSrcElts, i))
6616 return false;
6617
6618 return CastBitData(UndefSrcElts, SrcEltBits);
6619 }
6620
6621 // Extract constant bits from a broadcasted constant pool scalar.
6622 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6623 EltSizeInBits <= VT.getScalarSizeInBits()) {
6624 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6625 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6626 return false;
6627
6628 SDValue Ptr = MemIntr->getBasePtr();
6629 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6630 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6631 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6632
6633 APInt UndefSrcElts(NumSrcElts, 0);
6634 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6635 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6636 if (UndefSrcElts[0])
6637 UndefSrcElts.setBits(0, NumSrcElts);
6638 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6639 return CastBitData(UndefSrcElts, SrcEltBits);
6640 }
6641 }
6642 }
6643
6644 // Extract constant bits from a subvector broadcast.
6645 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
6646 SmallVector<APInt, 16> SubEltBits;
6647 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6648 UndefElts, SubEltBits, AllowWholeUndefs,
6649 AllowPartialUndefs)) {
6650 UndefElts = APInt::getSplat(NumElts, UndefElts);
6651 while (EltBits.size() < NumElts)
6652 EltBits.append(SubEltBits.begin(), SubEltBits.end());
6653 return true;
6654 }
6655 }
6656
6657 // Extract a rematerialized scalar constant insertion.
6658 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6659 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6660 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6661 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6662 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6663
6664 APInt UndefSrcElts(NumSrcElts, 0);
6665 SmallVector<APInt, 64> SrcEltBits;
6666 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6667 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6668 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6669 return CastBitData(UndefSrcElts, SrcEltBits);
6670 }
6671
6672 // Insert constant bits from base and sub-vector sources.
6673 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6674 // TODO - support insert_subvector through bitcasts.
6675 if (EltSizeInBits != VT.getScalarSizeInBits())
6676 return false;
6677
6678 APInt UndefSubElts;
6679 SmallVector<APInt, 32> EltSubBits;
6680 if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6681 UndefSubElts, EltSubBits,
6682 AllowWholeUndefs, AllowPartialUndefs) &&
6683 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6684 UndefElts, EltBits, AllowWholeUndefs,
6685 AllowPartialUndefs)) {
6686 unsigned BaseIdx = Op.getConstantOperandVal(2);
6687 UndefElts.insertBits(UndefSubElts, BaseIdx);
6688 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6689 EltBits[BaseIdx + i] = EltSubBits[i];
6690 return true;
6691 }
6692 }
6693
6694 // Extract constant bits from a subvector's source.
6695 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6696 // TODO - support extract_subvector through bitcasts.
6697 if (EltSizeInBits != VT.getScalarSizeInBits())
6698 return false;
6699
6700 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6701 UndefElts, EltBits, AllowWholeUndefs,
6702 AllowPartialUndefs)) {
6703 EVT SrcVT = Op.getOperand(0).getValueType();
6704 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6705 unsigned NumSubElts = VT.getVectorNumElements();
6706 unsigned BaseIdx = Op.getConstantOperandVal(1);
6707 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6708 if ((BaseIdx + NumSubElts) != NumSrcElts)
6709 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6710 if (BaseIdx != 0)
6711 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6712 return true;
6713 }
6714 }
6715
6716 // Extract constant bits from shuffle node sources.
6717 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6718 // TODO - support shuffle through bitcasts.
6719 if (EltSizeInBits != VT.getScalarSizeInBits())
6720 return false;
6721
6722 ArrayRef<int> Mask = SVN->getMask();
6723 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6724 llvm::any_of(Mask, [](int M) { return M < 0; }))
6725 return false;
6726
6727 APInt UndefElts0, UndefElts1;
6728 SmallVector<APInt, 32> EltBits0, EltBits1;
6729 if (isAnyInRange(Mask, 0, NumElts) &&
6730 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6731 UndefElts0, EltBits0, AllowWholeUndefs,
6732 AllowPartialUndefs))
6733 return false;
6734 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6735 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6736 UndefElts1, EltBits1, AllowWholeUndefs,
6737 AllowPartialUndefs))
6738 return false;
6739
6740 UndefElts = APInt::getNullValue(NumElts);
6741 for (int i = 0; i != (int)NumElts; ++i) {
6742 int M = Mask[i];
6743 if (M < 0) {
6744 UndefElts.setBit(i);
6745 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6746 } else if (M < (int)NumElts) {
6747 if (UndefElts0[M])
6748 UndefElts.setBit(i);
6749 EltBits.push_back(EltBits0[M]);
6750 } else {
6751 if (UndefElts1[M - NumElts])
6752 UndefElts.setBit(i);
6753 EltBits.push_back(EltBits1[M - NumElts]);
6754 }
6755 }
6756 return true;
6757 }
6758
6759 return false;
6760}
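// Illustrative example: querying a v4i32 build_vector <1, 2, 3, 4> with
// EltSizeInBits = 64 repacks the bits into two 64-bit elements,
// 0x0000000200000001 and 0x0000000400000003, with an empty UndefElts mask.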
6761
6762namespace llvm {
6763namespace X86 {
6764bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6765 APInt UndefElts;
6766 SmallVector<APInt, 16> EltBits;
6767 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6768 UndefElts, EltBits, true,
6769 AllowPartialUndefs)) {
6770 int SplatIndex = -1;
6771 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6772 if (UndefElts[i])
6773 continue;
6774 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6775 SplatIndex = -1;
6776 break;
6777 }
6778 SplatIndex = i;
6779 }
6780 if (0 <= SplatIndex) {
6781 SplatVal = EltBits[SplatIndex];
6782 return true;
6783 }
6784 }
6785
6786 return false;
6787}
6788} // namespace X86
6789} // namespace llvm
6790
6791static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6792 unsigned MaskEltSizeInBits,
6793 SmallVectorImpl<uint64_t> &RawMask,
6794 APInt &UndefElts) {
6795 // Extract the raw target constant bits.
6796 SmallVector<APInt, 64> EltBits;
6797 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6798 EltBits, /* AllowWholeUndefs */ true,
6799 /* AllowPartialUndefs */ false))
6800 return false;
6801
6802 // Insert the extracted elements into the mask.
6803 for (const APInt &Elt : EltBits)
6804 RawMask.push_back(Elt.getZExtValue());
6805
6806 return true;
6807}
6808
6809/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6810/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6811/// Note: This ignores saturation, so inputs must be checked first.
6812static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6813 bool Unary, unsigned NumStages = 1) {
6814 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6815 unsigned NumElts = VT.getVectorNumElements();
6816 unsigned NumLanes = VT.getSizeInBits() / 128;
6817 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6818 unsigned Offset = Unary ? 0 : NumElts;
6819 unsigned Repetitions = 1u << (NumStages - 1);
6820 unsigned Increment = 1u << NumStages;
6821 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
6822
6823 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6824 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6825 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6826 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6827 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6828 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6829 }
6830 }
6831}
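// Illustrative example: for VT = v8i16 (the result of packing two v4i32
// operands) with Unary = false and NumStages = 1 this yields
// <0, 2, 4, 6, 8, 10, 12, 14>: the even i16 of each bitcast operand, i.e. the
// truncation pattern PACKSSDW/PACKUSDW implement when nothing saturates.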
6832
6833// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6834static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6835 APInt &DemandedLHS, APInt &DemandedRHS) {
6836 int NumLanes = VT.getSizeInBits() / 128;
6837 int NumElts = DemandedElts.getBitWidth();
6838 int NumInnerElts = NumElts / 2;
6839 int NumEltsPerLane = NumElts / NumLanes;
6840 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6841
6842 DemandedLHS = APInt::getNullValue(NumInnerElts);
6843 DemandedRHS = APInt::getNullValue(NumInnerElts);
6844
6845 // Map DemandedElts to the packed operands.
6846 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6847 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6848 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6849 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6850 if (DemandedElts[OuterIdx])
6851 DemandedLHS.setBit(InnerIdx);
6852 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6853 DemandedRHS.setBit(InnerIdx);
6854 }
6855 }
6856}
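// Illustrative example: for a v16i8 pack result (one 128-bit lane,
// NumInnerElts = 8), a demanded result element 3 sets bit 3 of DemandedLHS,
// while a demanded result element 11 sets bit 3 of DemandedRHS, since the
// upper half of each lane comes from the second operand.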
6857
6858// Split the demanded elts of a HADD/HSUB node between its operands.
6859static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6860 APInt &DemandedLHS, APInt &DemandedRHS) {
6861 int NumLanes = VT.getSizeInBits() / 128;
6862 int NumElts = DemandedElts.getBitWidth();
6863 int NumEltsPerLane = NumElts / NumLanes;
6864 int HalfEltsPerLane = NumEltsPerLane / 2;
6865
6866 DemandedLHS = APInt::getNullValue(NumElts);
6867 DemandedRHS = APInt::getNullValue(NumElts);
6868
6869 // Map DemandedElts to the horizontal operands.
6870 for (int Idx = 0; Idx != NumElts; ++Idx) {
6871 if (!DemandedElts[Idx])
6872 continue;
6873 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6874 int LocalIdx = Idx % NumEltsPerLane;
6875 if (LocalIdx < HalfEltsPerLane) {
6876 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6877 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6878 } else {
6879 LocalIdx -= HalfEltsPerLane;
6880 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6881 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6882 }
6883 }
6884}
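// Illustrative example: for a v8i32 HADD (two 128-bit lanes of four elements),
// demanding result element 1 marks LHS elements 2 and 3, demanding element 2
// marks RHS elements 0 and 1, and demanding element 5 (lane 1, local index 1)
// marks LHS elements 6 and 7.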
6885
6886/// Calculates the shuffle mask corresponding to the target-specific opcode.
6887/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6888/// operands in \p Ops, and returns true.
6889/// Sets \p IsUnary to true if only one source is used. Note that this will set
6890/// IsUnary for shuffles which use a single input multiple times, and in those
6891/// cases it will adjust the mask to only have indices within that single input.
6892/// It is an error to call this with non-empty Mask/Ops vectors.
6893static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6894 SmallVectorImpl<SDValue> &Ops,
6895 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6896 unsigned NumElems = VT.getVectorNumElements();
6897 unsigned MaskEltSize = VT.getScalarSizeInBits();
6898 SmallVector<uint64_t, 32> RawMask;
6899 APInt RawUndefs;
6900 uint64_t ImmN;
6901
6902 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6903 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6904
6905 IsUnary = false;
6906 bool IsFakeUnary = false;
6907 switch (N->getOpcode()) {
6908 case X86ISD::BLENDI:
6909 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6910 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6911 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6912 DecodeBLENDMask(NumElems, ImmN, Mask);
6913 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6914 break;
6915 case X86ISD::SHUFP:
6916 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6917 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6918 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6919 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
6920 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6921 break;
6922 case X86ISD::INSERTPS:
6923 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6924 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6925 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6926 DecodeINSERTPSMask(ImmN, Mask);
6927 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6928 break;
6929 case X86ISD::EXTRQI:
6930 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6931 if (isa<ConstantSDNode>(N->getOperand(1)) &&
6932 isa<ConstantSDNode>(N->getOperand(2))) {
6933 int BitLen = N->getConstantOperandVal(1);
6934 int BitIdx = N->getConstantOperandVal(2);
6935 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6936 IsUnary = true;
6937 }
6938 break;
6939 case X86ISD::INSERTQI:
6940 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6941 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6942 if (isa<ConstantSDNode>(N->getOperand(2)) &&
6943 isa<ConstantSDNode>(N->getOperand(3))) {
6944 int BitLen = N->getConstantOperandVal(2);
6945 int BitIdx = N->getConstantOperandVal(3);
6946 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6947 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6948 }
6949 break;
6950 case X86ISD::UNPCKH:
6951 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6952 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6953 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
6954 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6955 break;
6956 case X86ISD::UNPCKL:
6957 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6958 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6959 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
6960 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6961 break;
6962 case X86ISD::MOVHLPS:
6963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6964 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6965 DecodeMOVHLPSMask(NumElems, Mask);
6966 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6967 break;
6968 case X86ISD::MOVLHPS:
6969 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6970 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6971 DecodeMOVLHPSMask(NumElems, Mask);
6972 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6973 break;
6974 case X86ISD::VALIGN:
6975 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
6976 "Only 32-bit and 64-bit elements are supported!");
6977 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6978 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6979 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6980 DecodeVALIGNMask(NumElems, ImmN, Mask);
6981 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6982 Ops.push_back(N->getOperand(1));
6983 Ops.push_back(N->getOperand(0));
6984 break;
6985 case X86ISD::PALIGNR:
6986 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6987 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6988 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6989 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6990 DecodePALIGNRMask(NumElems, ImmN, Mask);
6991 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6992 Ops.push_back(N->getOperand(1));
6993 Ops.push_back(N->getOperand(0));
6994 break;
6995 case X86ISD::VSHLDQ:
6996 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6997 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6998 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6999 DecodePSLLDQMask(NumElems, ImmN, Mask);
7000 IsUnary = true;
7001 break;
7002 case X86ISD::VSRLDQ:
7003 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7004 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7005 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7006 DecodePSRLDQMask(NumElems, ImmN, Mask);
7007 IsUnary = true;
7008 break;
7009 case X86ISD::PSHUFD:
7010 case X86ISD::VPERMILPI:
7011 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7012 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7013 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7014 IsUnary = true;
7015 break;
7016 case X86ISD::PSHUFHW:
7017 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7018 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7019 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7020 IsUnary = true;
7021 break;
7022 case X86ISD::PSHUFLW:
7023 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7024 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7025 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7026 IsUnary = true;
7027 break;
7028 case X86ISD::VZEXT_MOVL:
7029 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7030 DecodeZeroMoveLowMask(NumElems, Mask);
7031 IsUnary = true;
7032 break;
7033 case X86ISD::VBROADCAST:
7034 // We only decode broadcasts of same-sized vectors; peeking through to
7035 // extracted subvectors is likely to cause hasOneUse issues with
7036 // SimplifyDemandedBits etc.
7037 if (N->getOperand(0).getValueType() == VT) {
7038 DecodeVectorBroadcast(NumElems, Mask);
7039 IsUnary = true;
7040 break;
7041 }
7042 return false;
7043 case X86ISD::VPERMILPV: {
7044 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7045 IsUnary = true;
7046 SDValue MaskNode = N->getOperand(1);
7047 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7048 RawUndefs)) {
7049 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7050 break;
7051 }
7052 return false;
7053 }
7054 case X86ISD::PSHUFB: {
7055 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7056 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7057 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7058 IsUnary = true;
7059 SDValue MaskNode = N->getOperand(1);
7060 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7061 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7062 break;
7063 }
7064 return false;
7065 }
7066 case X86ISD::VPERMI:
7067 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7068 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7069 DecodeVPERMMask(NumElems, ImmN, Mask);
7070 IsUnary = true;
7071 break;
7072 case X86ISD::MOVSS:
7073 case X86ISD::MOVSD:
7074 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7075 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7076 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7077 break;
7078 case X86ISD::VPERM2X128:
7079 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7080 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7081 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7082 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7083 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7084 break;
7085 case X86ISD::SHUF128:
7086 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7087 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7088 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7089 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7090 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7091 break;
7092 case X86ISD::MOVSLDUP:
7093 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7094 DecodeMOVSLDUPMask(NumElems, Mask);
7095 IsUnary = true;
7096 break;
7097 case X86ISD::MOVSHDUP:
7098 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7099 DecodeMOVSHDUPMask(NumElems, Mask);
7100 IsUnary = true;
7101 break;
7102 case X86ISD::MOVDDUP:
7103 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7104 DecodeMOVDDUPMask(NumElems, Mask);
7105 IsUnary = true;
7106 break;
7107 case X86ISD::VPERMIL2: {
7108 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7109 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7110 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7111 SDValue MaskNode = N->getOperand(2);
7112 SDValue CtrlNode = N->getOperand(3);
7113 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7114 unsigned CtrlImm = CtrlOp->getZExtValue();
7115 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7116 RawUndefs)) {
7117 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7118 Mask);
7119 break;
7120 }
7121 }
7122 return false;
7123 }
7124 case X86ISD::VPPERM: {
7125 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7126 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7127 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7128 SDValue MaskNode = N->getOperand(2);
7129 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7130 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7131 break;
7132 }
7133 return false;
7134 }
7135 case X86ISD::VPERMV: {
7136 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7137 IsUnary = true;
7138 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7139 Ops.push_back(N->getOperand(1));
7140 SDValue MaskNode = N->getOperand(0);
7141 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7142 RawUndefs)) {
7143 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7144 break;
7145 }
7146 return false;
7147 }
7148 case X86ISD::VPERMV3: {
7149 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7150 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7151 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7152 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7153 Ops.push_back(N->getOperand(0));
7154 Ops.push_back(N->getOperand(2));
7155 SDValue MaskNode = N->getOperand(1);
7156 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7157 RawUndefs)) {
7158 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7159 break;
7160 }
7161 return false;
7162 }
7163 default: llvm_unreachable("unknown target shuffle node");
7164 }
7165
7166 // Empty mask indicates the decode failed.
7167 if (Mask.empty())
7168 return false;
7169
7170 // Check if we're getting a shuffle mask with zero'd elements.
7171 if (!AllowSentinelZero && isAnyZero(Mask))
7172 return false;
7173
7174 // If we have a fake unary shuffle, the shuffle mask is spread across two
7175 // inputs that are actually the same node. Re-map the mask to always point
7176 // into the first input.
7177 if (IsFakeUnary)
7178 for (int &M : Mask)
7179 if (M >= (int)Mask.size())
7180 M -= Mask.size();
7181
7182 // If we didn't already add operands in the opcode-specific code, default to
7183 // adding 1 or 2 operands starting at 0.
7184 if (Ops.empty()) {
7185 Ops.push_back(N->getOperand(0));
7186 if (!IsUnary || IsFakeUnary)
7187 Ops.push_back(N->getOperand(1));
7188 }
7189
7190 return true;
7191}
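
The fake-unary remapping step near the end of the function is easy to demonstrate on its own. A minimal sketch (plain std::vector<int> instead of the SmallVector mask; the UNPCKLPS mask used in the test is only an example):

#include <cassert>
#include <vector>

// When both shuffle operands are the same node ("fake unary"), indices that
// point into the second input (>= Size) are folded back into the first input
// so later code only has to track one source.
static void canonicalizeFakeUnaryMask(std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int &M : Mask)
    if (M >= Size)
      M -= Size;
}

int main() {
  // UNPCKLPS with V1 == V2 on a v4f32: raw mask {0, 4, 1, 5} becomes {0, 0, 1, 1}.
  std::vector<int> Mask = {0, 4, 1, 5};
  canonicalizeFakeUnaryMask(Mask);
  assert(Mask == std::vector<int>({0, 0, 1, 1}));
  return 0;
}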
7192
7193/// Compute whether each element of a shuffle is zeroable.
7194///
7195/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7196/// Either it is an undef element in the shuffle mask, the element of the input
7197/// referenced is undef, or the element of the input referenced is known to be
7198/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7199/// as many lanes with this technique as possible to simplify the remaining
7200/// shuffle.
7201static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7202 SDValue V1, SDValue V2,
7203 APInt &KnownUndef, APInt &KnownZero) {
7204 int Size = Mask.size();
7205 KnownUndef = KnownZero = APInt::getNullValue(Size);
7206
7207 V1 = peekThroughBitcasts(V1);
7208 V2 = peekThroughBitcasts(V2);
7209
7210 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7211 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7212
7213 int VectorSizeInBits = V1.getValueSizeInBits();
7214 int ScalarSizeInBits = VectorSizeInBits / Size;
7215 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7216
7217 for (int i = 0; i < Size; ++i) {
7218 int M = Mask[i];
7219 // Handle the easy cases.
7220 if (M < 0) {
7221 KnownUndef.setBit(i);
7222 continue;
7223 }
7224 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7225 KnownZero.setBit(i);
7226 continue;
7227 }
7228
7229 // Determine shuffle input and normalize the mask.
7230 SDValue V = M < Size ? V1 : V2;
7231 M %= Size;
7232
7233 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7234 if (V.getOpcode() != ISD::BUILD_VECTOR)
7235 continue;
7236
7237 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
7238 // portion of the (larger) source element must be UNDEF/ZERO.
7239 if ((Size % V.getNumOperands()) == 0) {
7240 int Scale = Size / V->getNumOperands();
7241 SDValue Op = V.getOperand(M / Scale);
7242 if (Op.isUndef())
7243 KnownUndef.setBit(i);
7244 if (X86::isZeroNode(Op))
7245 KnownZero.setBit(i);
7246 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7247 APInt Val = Cst->getAPIntValue();
7248 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7249 if (Val == 0)
7250 KnownZero.setBit(i);
7251 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7252 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7253 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7254 if (Val == 0)
7255 KnownZero.setBit(i);
7256 }
7257 continue;
7258 }
7259
7260 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
7261 // source elements must be UNDEF or ZERO.
7262 if ((V.getNumOperands() % Size) == 0) {
7263 int Scale = V->getNumOperands() / Size;
7264 bool AllUndef = true;
7265 bool AllZero = true;
7266 for (int j = 0; j < Scale; ++j) {
7267 SDValue Op = V.getOperand((M * Scale) + j);
7268 AllUndef &= Op.isUndef();
7269 AllZero &= X86::isZeroNode(Op);
7270 }
7271 if (AllUndef)
7272 KnownUndef.setBit(i);
7273 if (AllZero)
7274 KnownZero.setBit(i);
7275 continue;
7276 }
7277 }
7278}
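
A simplified standalone model of this zeroable computation, assuming both operands are constant build vectors with the same element count as the mask (plain C++ containers instead of SDValue/APInt; kUndef is a made-up marker for an UNDEF operand):

#include <cassert>
#include <climits>
#include <vector>

// A "build vector" is a list of constant scalars where kUndef marks an UNDEF
// operand; the mask indexes V1 for [0, Size) and V2 for [Size, 2*Size).
static const int kUndef = INT_MIN;

static void computeZeroable(const std::vector<int> &Mask,
                            const std::vector<int> &V1,
                            const std::vector<int> &V2,
                            std::vector<bool> &KnownUndef,
                            std::vector<bool> &KnownZero) {
  int Size = (int)Mask.size();
  KnownUndef.assign(Size, false);
  KnownZero.assign(Size, false);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) {                        // Undef mask element.
      KnownUndef[i] = true;
      continue;
    }
    int Op = (M < Size ? V1 : V2)[M % Size];
    if (Op == kUndef)                   // Referenced operand is UNDEF.
      KnownUndef[i] = true;
    else if (Op == 0)                   // Referenced operand is known zero.
      KnownZero[i] = true;
  }
}

int main() {
  std::vector<int> V1 = {1, 0, kUndef, 7};
  std::vector<int> V2 = {0, 0, 0, 0};
  std::vector<int> Mask = {-1, 1, 2, 4};
  std::vector<bool> KnownUndef, KnownZero;
  computeZeroable(Mask, V1, V2, KnownUndef, KnownZero);
  assert(KnownUndef[0] && KnownZero[1] && KnownUndef[2] && KnownZero[3]);
  return 0;
}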
7279
7280/// Decode a target shuffle mask and inputs and see if any values are
7281/// known to be undef or zero from their inputs.
7282/// Returns true if the target shuffle mask was decoded.
7283/// FIXME: Merge this with computeZeroableShuffleElements?
7284static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7285 SmallVectorImpl<SDValue> &Ops,
7286 APInt &KnownUndef, APInt &KnownZero) {
7287 bool IsUnary;
7288 if (!isTargetShuffle(N.getOpcode()))
7289 return false;
7290
7291 MVT VT = N.getSimpleValueType();
7292 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7293 return false;
7294
7295 int Size = Mask.size();
7296 SDValue V1 = Ops[0];
7297 SDValue V2 = IsUnary ? V1 : Ops[1];
7298 KnownUndef = KnownZero = APInt::getNullValue(Size);
7299
7300 V1 = peekThroughBitcasts(V1);
7301 V2 = peekThroughBitcasts(V2);
7302
7303 assert((VT.getSizeInBits() % Size) == 0 &&
7304 "Illegal split of shuffle value type");
7305 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7306
7307 // Extract known constant input data.
7308 APInt UndefSrcElts[2];
7309 SmallVector<APInt, 32> SrcEltBits[2];
7310 bool IsSrcConstant[2] = {
7311 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7312 SrcEltBits[0], true, false),
7313 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7314 SrcEltBits[1], true, false)};
7315
7316 for (int i = 0; i < Size; ++i) {
7317 int M = Mask[i];
7318
7319 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7320 if (M < 0) {
7321 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7322 if (SM_SentinelUndef == M)
7323 KnownUndef.setBit(i);
7324 if (SM_SentinelZero == M)
7325 KnownZero.setBit(i);
7326 continue;
7327 }
7328
7329 // Determine shuffle input and normalize the mask.
7330 unsigned SrcIdx = M / Size;
7331 SDValue V = M < Size ? V1 : V2;
7332 M %= Size;
7333
7334 // We are referencing an UNDEF input.
7335 if (V.isUndef()) {
7336 KnownUndef.setBit(i);
7337 continue;
7338 }
7339
7340 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7341 // TODO: We currently only set UNDEF for integer types - floats use the same
7342 // registers as vectors and many of the scalar folded loads rely on the
7343 // SCALAR_TO_VECTOR pattern.
7344 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7345 (Size % V.getValueType().getVectorNumElements()) == 0) {
7346 int Scale = Size / V.getValueType().getVectorNumElements();
7347 int Idx = M / Scale;
7348 if (Idx != 0 && !VT.isFloatingPoint())
7349 KnownUndef.setBit(i);
7350 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7351 KnownZero.setBit(i);
7352 continue;
7353 }
7354
7355 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7356 // base vectors.
7357 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7358 SDValue Vec = V.getOperand(0);
7359 int NumVecElts = Vec.getValueType().getVectorNumElements();
7360 if (Vec.isUndef() && Size == NumVecElts) {
7361 int Idx = V.getConstantOperandVal(2);
7362 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7363 if (M < Idx || (Idx + NumSubElts) <= M)
7364 KnownUndef.setBit(i);
7365 }
7366 continue;
7367 }
7368
7369 // Attempt to extract from the source's constant bits.
7370 if (IsSrcConstant[SrcIdx]) {
7371 if (UndefSrcElts[SrcIdx][M])
7372 KnownUndef.setBit(i);
7373 else if (SrcEltBits[SrcIdx][M] == 0)
7374 KnownZero.setBit(i);
7375 }
7376 }
7377
7378 assert(VT.getVectorNumElements() == (unsigned)Size &&
7379 "Different mask size from vector size!");
7380 return true;
7381}
7382
7383// Replace target shuffle mask elements with known undef/zero sentinels.
7384static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7385 const APInt &KnownUndef,
7386 const APInt &KnownZero,
7387 bool ResolveKnownZeros = true) {
7388 unsigned NumElts = Mask.size();
7389 assert(KnownUndef.getBitWidth() == NumElts &&
7390 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
57: Assuming the condition is true
58: Assuming the condition is true
59: '?' condition is true
7391
7392 for (unsigned i = 0; i != NumElts; ++i) {
60: Assuming 'i' is equal to 'NumElts'
61: Loop condition is false. Execution continues on line 7392
7393 if (KnownUndef[i])
7394 Mask[i] = SM_SentinelUndef;
7395 else if (ResolveKnownZeros && KnownZero[i])
7396 Mask[i] = SM_SentinelZero;
7397 }
7398}
62: Returning without writing to 'Mask.Size'
7399
7400// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7401static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7402 APInt &KnownUndef,
7403 APInt &KnownZero) {
7404 unsigned NumElts = Mask.size();
7405 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7406
7407 for (unsigned i = 0; i != NumElts; ++i) {
7408 int M = Mask[i];
7409 if (SM_SentinelUndef == M)
7410 KnownUndef.setBit(i);
7411 if (SM_SentinelZero == M)
7412 KnownZero.setBit(i);
7413 }
7414}
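
These two helpers are essentially inverses of each other. A standalone sketch of the round trip, assuming the usual -1/-2 sentinel encoding and dropping the ResolveKnownZeros flag for brevity (plain std::vector stand-ins, illustrative names):

#include <cassert>
#include <vector>

enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

// Fold known-undef/zero bits back into the mask
// (cf. resolveTargetShuffleFromZeroables)...
static void applyZeroables(std::vector<int> &Mask,
                           const std::vector<bool> &Undef,
                           const std::vector<bool> &Zero) {
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Undef[i])
      Mask[i] = SM_SentinelUndef;
    else if (Zero[i])
      Mask[i] = SM_SentinelZero;
  }
}

// ...and extract them again (cf. resolveZeroablesFromTargetShuffle).
static void extractZeroables(const std::vector<int> &Mask,
                             std::vector<bool> &Undef,
                             std::vector<bool> &Zero) {
  Undef.assign(Mask.size(), false);
  Zero.assign(Mask.size(), false);
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SM_SentinelUndef)
      Undef[i] = true;
    if (Mask[i] == SM_SentinelZero)
      Zero[i] = true;
  }
}

int main() {
  std::vector<int> Mask = {0, 1, 2, 3};
  std::vector<bool> Undef = {true, false, false, false};
  std::vector<bool> Zero = {false, false, true, false};
  applyZeroables(Mask, Undef, Zero);
  assert(Mask[0] == SM_SentinelUndef && Mask[2] == SM_SentinelZero);
  std::vector<bool> U2, Z2;
  extractZeroables(Mask, U2, Z2);
  assert(U2 == Undef && Z2 == Zero);
  return 0;
}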
7415
7416// Forward declaration (for getFauxShuffleMask recursive check).
7417// TODO: Use DemandedElts variant.
7418static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7419 SmallVectorImpl<int> &Mask,
7420 const SelectionDAG &DAG, unsigned Depth,
7421 bool ResolveKnownElts);
7422
7423// Attempt to decode ops that could be represented as a shuffle mask.
7424 // The decoded shuffle mask may contain a different number of elements than
7425 // the destination value type.
7426static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7427 SmallVectorImpl<int> &Mask,
7428 SmallVectorImpl<SDValue> &Ops,
7429 const SelectionDAG &DAG, unsigned Depth,
7430 bool ResolveKnownElts) {
7431 Mask.clear();
7432 Ops.clear();
7433
7434 MVT VT = N.getSimpleValueType();
7435 unsigned NumElts = VT.getVectorNumElements();
7436 unsigned NumSizeInBits = VT.getSizeInBits();
7437 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7438 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7439 return false;
7440 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size")((NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"
) ? static_cast<void> (0) : __assert_fail ("NumElts == DemandedElts.getBitWidth() && \"Unexpected vector size\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 7440, __PRETTY_FUNCTION__))
;
7441 unsigned NumSizeInBytes = NumSizeInBits / 8;
7442 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7443
7444 unsigned Opcode = N.getOpcode();
7445 switch (Opcode) {
7446 case ISD::VECTOR_SHUFFLE: {
7447 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle, so decode it here.
7448 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7449 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7450 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7451 Ops.push_back(N.getOperand(0));
7452 Ops.push_back(N.getOperand(1));
7453 return true;
7454 }
7455 return false;
7456 }
7457 case ISD::AND:
7458 case X86ISD::ANDNP: {
7459 // Attempt to decode as a per-byte mask.
7460 APInt UndefElts;
7461 SmallVector<APInt, 32> EltBits;
7462 SDValue N0 = N.getOperand(0);
7463 SDValue N1 = N.getOperand(1);
7464 bool IsAndN = (X86ISD::ANDNP == Opcode);
7465 uint64_t ZeroMask = IsAndN ? 255 : 0;
7466 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7467 return false;
7468 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7469 if (UndefElts[i]) {
7470 Mask.push_back(SM_SentinelUndef);
7471 continue;
7472 }
7473 const APInt &ByteBits = EltBits[i];
7474 if (ByteBits != 0 && ByteBits != 255)
7475 return false;
7476 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7477 }
7478 Ops.push_back(IsAndN ? N1 : N0);
7479 return true;
7480 }
7481 case ISD::OR: {
7482 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7483 // is a valid shuffle index.
7484 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7485 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7486 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7487 return false;
7488 SmallVector<int, 64> SrcMask0, SrcMask1;
7489 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7490 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7491 true) ||
7492 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7493 true))
7494 return false;
7495
7496 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7497 SmallVector<int, 64> Mask0, Mask1;
7498 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7499 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7500 for (int i = 0; i != (int)MaskSize; ++i) {
7501 if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
7502 Mask.push_back(SM_SentinelUndef);
7503 else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7504 Mask.push_back(SM_SentinelZero);
7505 else if (Mask1[i] == SM_SentinelZero)
7506 Mask.push_back(i);
7507 else if (Mask0[i] == SM_SentinelZero)
7508 Mask.push_back(i + MaskSize);
7509 else
7510 return false;
7511 }
7512 Ops.push_back(N0);
7513 Ops.push_back(N1);
7514 return true;
7515 }
7516 case ISD::INSERT_SUBVECTOR: {
7517 SDValue Src = N.getOperand(0);
7518 SDValue Sub = N.getOperand(1);
7519 EVT SubVT = Sub.getValueType();
7520 unsigned NumSubElts = SubVT.getVectorNumElements();
7521 if (!N->isOnlyUserOf(Sub.getNode()))
7522 return false;
7523 uint64_t InsertIdx = N.getConstantOperandVal(2);
7524 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7525 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7526 Sub.getOperand(0).getValueType() == VT) {
7527 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7528 for (int i = 0; i != (int)NumElts; ++i)
7529 Mask.push_back(i);
7530 for (int i = 0; i != (int)NumSubElts; ++i)
7531 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7532 Ops.push_back(Src);
7533 Ops.push_back(Sub.getOperand(0));
7534 return true;
7535 }
7536 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7537 SmallVector<int, 64> SubMask;
7538 SmallVector<SDValue, 2> SubInputs;
7539 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7540 SubMask, DAG, Depth + 1, ResolveKnownElts))
7541 return false;
7542
7543 // Subvector shuffle inputs must not be larger than the subvector.
7544 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7545 return SubVT.getFixedSizeInBits() <
7546 SubInput.getValueSizeInBits().getFixedSize();
7547 }))
7548 return false;
7549
7550 if (SubMask.size() != NumSubElts) {
7551 assert(((SubMask.size() % NumSubElts) == 0 ||
7552 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7553 if ((NumSubElts % SubMask.size()) == 0) {
7554 int Scale = NumSubElts / SubMask.size();
7555 SmallVector<int,64> ScaledSubMask;
7556 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7557 SubMask = ScaledSubMask;
7558 } else {
7559 int Scale = SubMask.size() / NumSubElts;
7560 NumSubElts = SubMask.size();
7561 NumElts *= Scale;
7562 InsertIdx *= Scale;
7563 }
7564 }
7565 Ops.push_back(Src);
7566 Ops.append(SubInputs.begin(), SubInputs.end());
7567 if (ISD::isBuildVectorAllZeros(Src.getNode()))
7568 Mask.append(NumElts, SM_SentinelZero);
7569 else
7570 for (int i = 0; i != (int)NumElts; ++i)
7571 Mask.push_back(i);
7572 for (int i = 0; i != (int)NumSubElts; ++i) {
7573 int M = SubMask[i];
7574 if (0 <= M) {
7575 int InputIdx = M / NumSubElts;
7576 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7577 }
7578 Mask[i + InsertIdx] = M;
7579 }
7580 return true;
7581 }
7582 case X86ISD::PINSRB:
7583 case X86ISD::PINSRW:
7584 case ISD::SCALAR_TO_VECTOR:
7585 case ISD::INSERT_VECTOR_ELT: {
7586 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7587 // vector, for matching src/dst vector types.
7588 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7589
7590 unsigned DstIdx = 0;
7591 if (Opcode != ISD::SCALAR_TO_VECTOR) {
7592 // Check we have an in-range constant insertion index.
7593 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7594 N.getConstantOperandAPInt(2).uge(NumElts))
7595 return false;
7596 DstIdx = N.getConstantOperandVal(2);
7597
7598 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7599 if (X86::isZeroNode(Scl)) {
7600 Ops.push_back(N.getOperand(0));
7601 for (unsigned i = 0; i != NumElts; ++i)
7602 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7603 return true;
7604 }
7605 }
7606
7607 // Peek through trunc/aext/zext.
7608 // TODO: aext shouldn't require SM_SentinelZero padding.
7609 // TODO: handle shift of scalars.
7610 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7611 while (Scl.getOpcode() == ISD::TRUNCATE ||
7612 Scl.getOpcode() == ISD::ANY_EXTEND ||
7613 Scl.getOpcode() == ISD::ZERO_EXTEND) {
7614 Scl = Scl.getOperand(0);
7615 MinBitsPerElt =
7616 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7617 }
7618 if ((MinBitsPerElt % 8) != 0)
7619 return false;
7620
7621 // Attempt to find the source vector the scalar was extracted from.
7622 SDValue SrcExtract;
7623 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7624 Scl.getOpcode() == X86ISD::PEXTRW ||
7625 Scl.getOpcode() == X86ISD::PEXTRB) &&
7626 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7627 SrcExtract = Scl;
7628 }
7629 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7630 return false;
7631
7632 SDValue SrcVec = SrcExtract.getOperand(0);
7633 EVT SrcVT = SrcVec.getValueType();
7634 if (!SrcVT.getScalarType().isByteSized())
7635 return false;
7636 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7637 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7638 unsigned DstByte = DstIdx * NumBytesPerElt;
7639 MinBitsPerElt =
7640 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7641
7642 // Create 'identity' byte level shuffle mask and then add inserted bytes.
7643 if (Opcode == ISD::SCALAR_TO_VECTOR) {
7644 Ops.push_back(SrcVec);
7645 Mask.append(NumSizeInBytes, SM_SentinelUndef);
7646 } else {
7647 Ops.push_back(SrcVec);
7648 Ops.push_back(N.getOperand(0));
7649 for (int i = 0; i != (int)NumSizeInBytes; ++i)
7650 Mask.push_back(NumSizeInBytes + i);
7651 }
7652
7653 unsigned MinBytesPerElts = MinBitsPerElt / 8;
7654 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7655 for (unsigned i = 0; i != MinBytesPerElts; ++i)
7656 Mask[DstByte + i] = SrcByte + i;
7657 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7658 Mask[DstByte + i] = SM_SentinelZero;
7659 return true;
7660 }
7661 case X86ISD::PACKSS:
7662 case X86ISD::PACKUS: {
7663 SDValue N0 = N.getOperand(0);
7664 SDValue N1 = N.getOperand(1);
7665 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7666 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7667 "Unexpected input value type");
7668
7669 APInt EltsLHS, EltsRHS;
7670 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7671
7672 // If we know input saturation won't happen (or we don't care about particular
7673 // lanes), we can treat this as a truncation shuffle.
7674 if (Opcode == X86ISD::PACKSS) {
7675 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7676 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7677 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7678 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7679 return false;
7680 } else {
7681 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7682 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7683 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7684 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7685 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7686 return false;
7687 }
7688
7689 bool IsUnary = (N0 == N1);
7690
7691 Ops.push_back(N0);
7692 if (!IsUnary)
7693 Ops.push_back(N1);
7694
7695 createPackShuffleMask(VT, Mask, IsUnary);
7696 return true;
7697 }
7698 case X86ISD::VTRUNC: {
7699 SDValue Src = N.getOperand(0);
7700 EVT SrcVT = Src.getValueType();
7701 // Truncated source must be a simple vector.
7702 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7703 (SrcVT.getScalarSizeInBits() % 8) != 0)
7704 return false;
7705 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7706 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7707 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7708 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
7709 for (unsigned i = 0; i != NumSrcElts; ++i)
7710 Mask.push_back(i * Scale);
7711 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7712 Ops.push_back(Src);
7713 return true;
7714 }
7715 case X86ISD::VSHLI:
7716 case X86ISD::VSRLI: {
7717 uint64_t ShiftVal = N.getConstantOperandVal(1);
7718 // Out of range bit shifts are guaranteed to be zero.
7719 if (NumBitsPerElt <= ShiftVal) {
7720 Mask.append(NumElts, SM_SentinelZero);
7721 return true;
7722 }
7723
7724 // We can only decode 'whole byte' bit shifts as shuffles.
7725 if ((ShiftVal % 8) != 0)
7726 break;
7727
7728 uint64_t ByteShift = ShiftVal / 8;
7729 Ops.push_back(N.getOperand(0));
7730
7731 // Clear mask to all zeros and insert the shifted byte indices.
7732 Mask.append(NumSizeInBytes, SM_SentinelZero);
7733
7734 if (X86ISD::VSHLI == Opcode) {
7735 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7736 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7737 Mask[i + j] = i + j - ByteShift;
7738 } else {
7739 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7740 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7741 Mask[i + j - ByteShift] = i + j;
7742 }
7743 return true;
7744 }
7745 case X86ISD::VROTLI:
7746 case X86ISD::VROTRI: {
7747 // We can only decode 'whole byte' bit rotates as shuffles.
7748 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7749 if ((RotateVal % 8) != 0)
7750 return false;
7751 Ops.push_back(N.getOperand(0));
7752 int Offset = RotateVal / 8;
7753 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7754 for (int i = 0; i != (int)NumElts; ++i) {
7755 int BaseIdx = i * NumBytesPerElt;
7756 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7757 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7758 }
7759 }
7760 return true;
7761 }
7762 case X86ISD::VBROADCAST: {
7763 SDValue Src = N.getOperand(0);
7764 if (!Src.getSimpleValueType().isVector())
7765 return false;
7766 Ops.push_back(Src);
7767 Mask.append(NumElts, 0);
7768 return true;
7769 }
7770 case ISD::ZERO_EXTEND:
7771 case ISD::ANY_EXTEND:
7772 case ISD::ZERO_EXTEND_VECTOR_INREG:
7773 case ISD::ANY_EXTEND_VECTOR_INREG: {
7774 SDValue Src = N.getOperand(0);
7775 EVT SrcVT = Src.getValueType();
7776
7777 // Extended source must be a simple vector.
7778 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7779 (SrcVT.getScalarSizeInBits() % 8) != 0)
7780 return false;
7781
7782 bool IsAnyExtend =
7783 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7784 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7785 IsAnyExtend, Mask);
7786 Ops.push_back(Src);
7787 return true;
7788 }
7789 }
7790
7791 return false;
7792}
7793
7794/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7795static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7796 SmallVectorImpl<int> &Mask) {
7797 int MaskWidth = Mask.size();
7798 SmallVector<SDValue, 16> UsedInputs;
7799 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7800 int lo = UsedInputs.size() * MaskWidth;
7801 int hi = lo + MaskWidth;
7802
7803 // Strip UNDEF input usage.
7804 if (Inputs[i].isUndef())
7805 for (int &M : Mask)
7806 if ((lo <= M) && (M < hi))
7807 M = SM_SentinelUndef;
7808
7809 // Check for unused inputs.
7810 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7811 for (int &M : Mask)
7812 if (lo <= M)
7813 M -= MaskWidth;
7814 continue;
7815 }
7816
7817 // Check for repeated inputs.
7818 bool IsRepeat = false;
7819 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7820 if (UsedInputs[j] != Inputs[i])
7821 continue;
7822 for (int &M : Mask)
7823 if (lo <= M)
7824 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7825 IsRepeat = true;
7826 break;
7827 }
7828 if (IsRepeat)
7829 continue;
7830
7831 UsedInputs.push_back(Inputs[i]);
7832 }
7833 Inputs = UsedInputs;
7834}
7835
7836/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7837/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7838/// Returns true if the target shuffle mask was decoded.
7839static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7840 SmallVectorImpl<SDValue> &Inputs,
7841 SmallVectorImpl<int> &Mask,
7842 APInt &KnownUndef, APInt &KnownZero,
7843 const SelectionDAG &DAG, unsigned Depth,
7844 bool ResolveKnownElts) {
7845 EVT VT = Op.getValueType();
7846 if (!VT.isSimple() || !VT.isVector())
[25] Calling 'EVT::isSimple'
[27] Returning from 'EVT::isSimple'
[28] Calling 'EVT::isVector'
[34] Returning from 'EVT::isVector'
[35] Taking false branch
7847 return false;
7848
7849 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
[36] Value assigned to 'OpMask.Size'
[37] Assuming the condition is true
[38] Taking true branch
7850 if (ResolveKnownElts)
[38.1] 'ResolveKnownElts' is false
[39] Taking false branch
7851 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7852 return true;
[40] Returning the value 1, which participates in a condition later
7853 }
7854 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7855 ResolveKnownElts)) {
7856 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7857 return true;
7858 }
7859 return false;
7860}
7861
7862static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7863 SmallVectorImpl<int> &Mask,
7864 const SelectionDAG &DAG, unsigned Depth = 0,
7865 bool ResolveKnownElts = true) {
7866 EVT VT = Op.getValueType();
7867 if (!VT.isSimple() || !VT.isVector())
7868 return false;
7869
7870 APInt KnownUndef, KnownZero;
7871 unsigned NumElts = Op.getValueType().getVectorNumElements();
7872 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7873 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7874 KnownZero, DAG, Depth, ResolveKnownElts);
7875}
7876
7877/// Returns the scalar element that will make up the i'th
7878/// element of the result of the vector shuffle.
7879static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7880 SelectionDAG &DAG, unsigned Depth) {
7881 if (Depth >= SelectionDAG::MaxRecursionDepth)
7882 return SDValue(); // Limit search depth.
7883
7884 EVT VT = Op.getValueType();
7885 unsigned Opcode = Op.getOpcode();
7886 unsigned NumElems = VT.getVectorNumElements();
7887
7888 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7889 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
7890 int Elt = SV->getMaskElt(Index);
7891
7892 if (Elt < 0)
7893 return DAG.getUNDEF(VT.getVectorElementType());
7894
7895 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
7896 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7897 }
7898
7899 // Recurse into target specific vector shuffles to find scalars.
7900 if (isTargetShuffle(Opcode)) {
7901 MVT ShufVT = VT.getSimpleVT();
7902 MVT ShufSVT = ShufVT.getVectorElementType();
7903 int NumElems = (int)ShufVT.getVectorNumElements();
7904 SmallVector<int, 16> ShuffleMask;
7905 SmallVector<SDValue, 16> ShuffleOps;
7906 bool IsUnary;
7907
7908 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
7909 ShuffleMask, IsUnary))
7910 return SDValue();
7911
7912 int Elt = ShuffleMask[Index];
7913 if (Elt == SM_SentinelZero)
7914 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
7915 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
7916 if (Elt == SM_SentinelUndef)
7917 return DAG.getUNDEF(ShufSVT);
7918
7919 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
7920 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7921 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7922 }
7923
7924 // Recurse into insert_subvector base/sub vector to find scalars.
7925 if (Opcode == ISD::INSERT_SUBVECTOR) {
7926 SDValue Vec = Op.getOperand(0);
7927 SDValue Sub = Op.getOperand(1);
7928 uint64_t SubIdx = Op.getConstantOperandVal(2);
7929 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
7930
7931 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7932 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
7933 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
7934 }
7935
7936 // Recurse into concat_vectors sub vector to find scalars.
7937 if (Opcode == ISD::CONCAT_VECTORS) {
7938 EVT SubVT = Op.getOperand(0).getValueType();
7939 unsigned NumSubElts = SubVT.getVectorNumElements();
7940 uint64_t SubIdx = Index / NumSubElts;
7941 uint64_t SubElt = Index % NumSubElts;
7942 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
7943 }
7944
7945 // Recurse into extract_subvector src vector to find scalars.
7946 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
7947 SDValue Src = Op.getOperand(0);
7948 uint64_t SrcIdx = Op.getConstantOperandVal(1);
7949 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
7950 }
7951
7952 // We only peek through bitcasts of the same vector width.
7953 if (Opcode == ISD::BITCAST) {
7954 SDValue Src = Op.getOperand(0);
7955 EVT SrcVT = Src.getValueType();
7956 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
7957 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
7958 return SDValue();
7959 }
7960
7961 // Actual nodes that may contain scalar elements
7962
7963 // For insert_vector_elt - either return the index matching scalar or recurse
7964 // into the base vector.
7965 if (Opcode == ISD::INSERT_VECTOR_ELT &&
7966 isa<ConstantSDNode>(Op.getOperand(2))) {
7967 if (Op.getConstantOperandAPInt(2) == Index)
7968 return Op.getOperand(1);
7969 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
7970 }
7971
7972 if (Opcode == ISD::SCALAR_TO_VECTOR)
7973 return (Index == 0) ? Op.getOperand(0)
7974 : DAG.getUNDEF(VT.getVectorElementType());
7975
7976 if (Opcode == ISD::BUILD_VECTOR)
7977 return Op.getOperand(Index);
7978
7979 return SDValue();
7980}
7981
7982// Use PINSRB/PINSRW/PINSRD to create a build vector.
7983static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
7984 unsigned NumNonZero, unsigned NumZero,
7985 SelectionDAG &DAG,
7986 const X86Subtarget &Subtarget) {
7987 MVT VT = Op.getSimpleValueType();
7988 unsigned NumElts = VT.getVectorNumElements();
7989 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7990 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7991 "Illegal vector insertion");
7992
7993 SDLoc dl(Op);
7994 SDValue V;
7995 bool First = true;
7996
7997 for (unsigned i = 0; i < NumElts; ++i) {
7998 bool IsNonZero = (NonZeros & (1 << i)) != 0;
7999 if (!IsNonZero)
8000 continue;
8001
8002 // If the build vector contains zeros or our first insertion is not the
8003 // first index, then insert into a zero vector to break any register
8004 // dependency; else use SCALAR_TO_VECTOR.
8005 if (First) {
8006 First = false;
8007 if (NumZero || 0 != i)
8008 V = getZeroVector(VT, Subtarget, DAG, dl);
8009 else {
8010 assert(0 == i && "Expected insertion into zero-index");
8011 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8012 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8013 V = DAG.getBitcast(VT, V);
8014 continue;
8015 }
8016 }
8017 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8018 DAG.getIntPtrConstant(i, dl));
8019 }
8020
8021 return V;
8022}
8023
8024/// Custom lower build_vector of v16i8.
8025static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
8026 unsigned NumNonZero, unsigned NumZero,
8027 SelectionDAG &DAG,
8028 const X86Subtarget &Subtarget) {
8029 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8030 return SDValue();
8031
8032 // SSE4.1 - use PINSRB to insert each byte directly.
8033 if (Subtarget.hasSSE41())
8034 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
8035 Subtarget);
8036
8037 SDLoc dl(Op);
8038 SDValue V;
8039
8040 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8041 for (unsigned i = 0; i < 16; i += 2) {
8042 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
8043 bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
8044 if (!ThisIsNonZero && !NextIsNonZero)
8045 continue;
8046
8047 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8048 SDValue Elt;
8049 if (ThisIsNonZero) {
8050 if (NumZero || NextIsNonZero)
8051 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8052 else
8053 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8054 }
8055
8056 if (NextIsNonZero) {
8057 SDValue NextElt = Op.getOperand(i + 1);
8058 if (i == 0 && NumZero)
8059 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8060 else
8061 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8062 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8063 DAG.getConstant(8, dl, MVT::i8));
8064 if (ThisIsNonZero)
8065 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8066 else
8067 Elt = NextElt;
8068 }
8069
8070 // If our first insertion is not the first index or zeros are needed, then
8071 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8072 // elements undefined).
8073 if (!V) {
8074 if (i != 0 || NumZero)
8075 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8076 else {
8077 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8078 V = DAG.getBitcast(MVT::v8i16, V);
8079 continue;
8080 }
8081 }
8082 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8083 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8084 DAG.getIntPtrConstant(i / 2, dl));
8085 }
8086
8087 return DAG.getBitcast(MVT::v16i8, V);
8088}
8089
8090/// Custom lower build_vector of v8i16.
8091static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
8092 unsigned NumNonZero, unsigned NumZero,
8093 SelectionDAG &DAG,
8094 const X86Subtarget &Subtarget) {
8095 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8096 return SDValue();
8097
8098 // Use PINSRW to insert each element directly.
8099 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
8100 Subtarget);
8101}
8102
8103/// Custom lower build_vector of v4i32 or v4f32.
8104static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8105 const X86Subtarget &Subtarget) {
8106 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8107 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8108 // Because we're creating a less complicated build vector here, we may enable
8109 // further folding of the MOVDDUP via shuffle transforms.
8110 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8111 Op.getOperand(0) == Op.getOperand(2) &&
8112 Op.getOperand(1) == Op.getOperand(3) &&
8113 Op.getOperand(0) != Op.getOperand(1)) {
8114 SDLoc DL(Op);
8115 MVT VT = Op.getSimpleValueType();
8116 MVT EltVT = VT.getVectorElementType();
8117 // Create a new build vector with the first 2 elements followed by undef
8118 // padding, bitcast to v2f64, duplicate, and bitcast back.
8119 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8120 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8121 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8122 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8123 return DAG.getBitcast(VT, Dup);
8124 }
8125
8126 // Find all zeroable elements.
8127 std::bitset<4> Zeroable, Undefs;
8128 for (int i = 0; i < 4; ++i) {
8129 SDValue Elt = Op.getOperand(i);
8130 Undefs[i] = Elt.isUndef();
8131 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8132 }
8133 assert(Zeroable.size() - Zeroable.count() > 1 &&
8134 "We expect at least two non-zero elements!");
8135
8136 // We only know how to deal with build_vector nodes where elements are either
8137 // zeroable or extract_vector_elt with constant index.
8138 SDValue FirstNonZero;
8139 unsigned FirstNonZeroIdx;
8140 for (unsigned i = 0; i < 4; ++i) {
8141 if (Zeroable[i])
8142 continue;
8143 SDValue Elt = Op.getOperand(i);
8144 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8145 !isa<ConstantSDNode>(Elt.getOperand(1)))
8146 return SDValue();
8147 // Make sure that this node is extracting from a 128-bit vector.
8148 MVT VT = Elt.getOperand(0).getSimpleValueType();
8149 if (!VT.is128BitVector())
8150 return SDValue();
8151 if (!FirstNonZero.getNode()) {
8152 FirstNonZero = Elt;
8153 FirstNonZeroIdx = i;
8154 }
8155 }
8156
8157 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8158 SDValue V1 = FirstNonZero.getOperand(0);
8159 MVT VT = V1.getSimpleValueType();
8160
8161 // See if this build_vector can be lowered as a blend with zero.
8162 SDValue Elt;
8163 unsigned EltMaskIdx, EltIdx;
8164 int Mask[4];
8165 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8166 if (Zeroable[EltIdx]) {
8167 // The zero vector will be on the right hand side.
8168 Mask[EltIdx] = EltIdx+4;
8169 continue;
8170 }
8171
8172 Elt = Op->getOperand(EltIdx);
8173 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
8174 EltMaskIdx = Elt.getConstantOperandVal(1);
8175 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8176 break;
8177 Mask[EltIdx] = EltIdx;
8178 }
8179
8180 if (EltIdx == 4) {
8181 // Let the shuffle legalizer deal with blend operations.
8182 SDValue VZeroOrUndef = (Zeroable == Undefs)
8183 ? DAG.getUNDEF(VT)
8184 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8185 if (V1.getSimpleValueType() != VT)
8186 V1 = DAG.getBitcast(VT, V1);
8187 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8188 }
8189
8190 // See if we can lower this build_vector to a INSERTPS.
8191 if (!Subtarget.hasSSE41())
8192 return SDValue();
8193
8194 SDValue V2 = Elt.getOperand(0);
8195 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8196 V1 = SDValue();
8197
8198 bool CanFold = true;
8199 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8200 if (Zeroable[i])
8201 continue;
8202
8203 SDValue Current = Op->getOperand(i);
8204 SDValue SrcVector = Current->getOperand(0);
8205 if (!V1.getNode())
8206 V1 = SrcVector;
8207 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8208 }
8209
8210 if (!CanFold)
8211 return SDValue();
8212
8213 assert(V1.getNode() && "Expected at least two non-zero elements!");
8214 if (V1.getSimpleValueType() != MVT::v4f32)
8215 V1 = DAG.getBitcast(MVT::v4f32, V1);
8216 if (V2.getSimpleValueType() != MVT::v4f32)
8217 V2 = DAG.getBitcast(MVT::v4f32, V2);
8218
8219 // Ok, we can emit an INSERTPS instruction.
8220 unsigned ZMask = Zeroable.to_ulong();
8221
8222 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8223 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8224 SDLoc DL(Op);
8225 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8226 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8227 return DAG.getBitcast(VT, Result);
8228}
8229
8230/// Return a vector logical shift node.
8231static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8232 SelectionDAG &DAG, const TargetLowering &TLI,
8233 const SDLoc &dl) {
8234 assert(VT.is128BitVector() && "Unknown type for VShift");
8235 MVT ShVT = MVT::v16i8;
8236 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8237 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8238 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8239 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8240 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8241}
8242
8243static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8244 SelectionDAG &DAG) {
8245
8246 // Check if the scalar load can be widened into a vector load. And if
8247 // the address is "base + cst" see if the cst can be "absorbed" into
8248 // the shuffle mask.
8249 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8250 SDValue Ptr = LD->getBasePtr();
8251 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8252 return SDValue();
8253 EVT PVT = LD->getValueType(0);
8254 if (PVT != MVT::i32 && PVT != MVT::f32)
8255 return SDValue();
8256
8257 int FI = -1;
8258 int64_t Offset = 0;
8259 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8260 FI = FINode->getIndex();
8261 Offset = 0;
8262 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8263 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8264 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8265 Offset = Ptr.getConstantOperandVal(1);
8266 Ptr = Ptr.getOperand(0);
8267 } else {
8268 return SDValue();
8269 }
8270
8271 // FIXME: 256-bit vector instructions don't require a strict alignment,
8272 // improve this code to support it better.
8273 Align RequiredAlign(VT.getSizeInBits() / 8);
8274 SDValue Chain = LD->getChain();
8275 // Make sure the stack object alignment is at least 16 or 32.
8276 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8277 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8278 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8279 if (MFI.isFixedObjectIndex(FI)) {
8280 // Can't change the alignment. FIXME: It's possible to compute
8281 // the exact stack offset and reference FI + adjust offset instead.
8282 // If someone *really* cares about this. That's the way to implement it.
8283 return SDValue();
8284 } else {
8285 MFI.setObjectAlignment(FI, RequiredAlign);
8286 }
8287 }
8288
8289 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8290 // Ptr + (Offset & ~15).
8291 if (Offset < 0)
8292 return SDValue();
8293 if ((Offset % RequiredAlign.value()) & 3)
8294 return SDValue();
8295 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8296 if (StartOffset) {
8297 SDLoc DL(Ptr);
8298 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8299 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8300 }
8301
8302 int EltNo = (Offset - StartOffset) >> 2;
8303 unsigned NumElems = VT.getVectorNumElements();
8304
8305 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8306 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8307 LD->getPointerInfo().getWithOffset(StartOffset));
8308
8309 SmallVector<int, 8> Mask(NumElems, EltNo);
8310
8311 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8312 }
8313
8314 return SDValue();
8315}
8316
8317 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8318static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8319 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8320 auto *BaseLd = cast<LoadSDNode>(Elt);
8321 if (!BaseLd->isSimple())
8322 return false;
8323 Ld = BaseLd;
8324 ByteOffset = 0;
8325 return true;
8326 }
8327
8328 switch (Elt.getOpcode()) {
8329 case ISD::BITCAST:
8330 case ISD::TRUNCATE:
8331 case ISD::SCALAR_TO_VECTOR:
8332 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8333 case ISD::SRL:
8334 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8335 uint64_t Idx = IdxC->getZExtValue();
8336 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8337 ByteOffset += Idx / 8;
8338 return true;
8339 }
8340 }
8341 break;
8342 case ISD::EXTRACT_VECTOR_ELT:
8343 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8344 SDValue Src = Elt.getOperand(0);
8345 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8346 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8347 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8348 findEltLoadSrc(Src, Ld, ByteOffset)) {
8349 uint64_t Idx = IdxC->getZExtValue();
8350 ByteOffset += Idx * (SrcSizeInBits / 8);
8351 return true;
8352 }
8353 }
8354 break;
8355 }
8356
8357 return false;
8358}
8359
8360/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8361/// elements can be replaced by a single large load which has the same value as
8362/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8363///
8364/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8365static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8366 const SDLoc &DL, SelectionDAG &DAG,
8367 const X86Subtarget &Subtarget,
8368 bool isAfterLegalize) {
8369 if ((VT.getScalarSizeInBits() % 8) != 0)
8370 return SDValue();
8371
8372 unsigned NumElems = Elts.size();
8373
8374 int LastLoadedElt = -1;
8375 APInt LoadMask = APInt::getNullValue(NumElems);
8376 APInt ZeroMask = APInt::getNullValue(NumElems);
8377 APInt UndefMask = APInt::getNullValue(NumElems);
8378
8379 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8380 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8381
8382 // For each element in the initializer, see if we've found a load, zero or an
8383 // undef.
8384 for (unsigned i = 0; i < NumElems; ++i) {
8385 SDValue Elt = peekThroughBitcasts(Elts[i]);
8386 if (!Elt.getNode())
8387 return SDValue();
8388 if (Elt.isUndef()) {
8389 UndefMask.setBit(i);
8390 continue;
8391 }
8392 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8393 ZeroMask.setBit(i);
8394 continue;
8395 }
8396
8397 // Each loaded element must be the correct fractional portion of the
8398 // requested vector load.
8399 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8400 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8401 return SDValue();
8402
8403 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8404 return SDValue();
8405 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8406 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8407 return SDValue();
8408
8409 LoadMask.setBit(i);
8410 LastLoadedElt = i;
8411 }
8412 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8413 LoadMask.countPopulation()) == NumElems &&
8414 "Incomplete element masks");
8415
8416 // Handle Special Cases - all undef or undef/zero.
8417 if (UndefMask.countPopulation() == NumElems)
8418 return DAG.getUNDEF(VT);
8419 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8420 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8421 : DAG.getConstantFP(0.0, DL, VT);
8422
8423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8424 int FirstLoadedElt = LoadMask.countTrailingZeros();
8425 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8426 EVT EltBaseVT = EltBase.getValueType();
8427 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8428 "Register/Memory size mismatch");
8429 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8430 assert(LDBase && "Did not find base load for merging consecutive loads");
8431 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8432 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8433 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8434 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8435 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8436
8437 // TODO: Support offsetting the base load.
8438 if (ByteOffsets[FirstLoadedElt] != 0)
8439 return SDValue();
8440
8441 // Check to see if the element's load is consecutive to the base load
8442 // or offset from a previous (already checked) load.
8443 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8444 LoadSDNode *Ld = Loads[EltIdx];
8445 int64_t ByteOffset = ByteOffsets[EltIdx];
8446 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8447 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8448 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8449 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8450 }
8451 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8452 EltIdx - FirstLoadedElt);
8453 };
8454
8455 // Consecutive loads can contain UNDEFs but not ZERO elements.
8456 // Consecutive loads with UNDEF and ZERO elements require an additional
8457 // shuffle stage to clear the ZERO elements.
8458 bool IsConsecutiveLoad = true;
8459 bool IsConsecutiveLoadWithZeros = true;
8460 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8461 if (LoadMask[i]) {
8462 if (!CheckConsecutiveLoad(LDBase, i)) {
8463 IsConsecutiveLoad = false;
8464 IsConsecutiveLoadWithZeros = false;
8465 break;
8466 }
8467 } else if (ZeroMask[i]) {
8468 IsConsecutiveLoad = false;
8469 }
8470 }
8471
8472 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8473 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8474 assert(LDBase->isSimple() &&
8475 "Cannot merge volatile or atomic loads.");
8476 SDValue NewLd =
8477 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8478 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8479 MMOFlags);
8480 for (auto *LD : Loads)
8481 if (LD)
8482 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8483 return NewLd;
8484 };
8485
8486 // Check if the base load is entirely dereferenceable.
8487 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8488 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8489
8490 // LOAD - all consecutive load/undefs (must start/end with a load or be
8491 // entirely dereferenceable). If we have found an entire vector of loads and
8492 // undefs, then return a large load of the entire vector width starting at the
8493 // base pointer. If the vector contains zeros, then attempt to shuffle those
8494 // elements.
8495 if (FirstLoadedElt == 0 &&
8496 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8497 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8498 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8499 return SDValue();
8500
8501 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8502 // will lower to regular temporal loads and use the cache.
8503 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8504 VT.is256BitVector() && !Subtarget.hasInt256())
8505 return SDValue();
8506
8507 if (NumElems == 1)
8508 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8509
8510 if (!ZeroMask)
8511 return CreateLoad(VT, LDBase);
8512
8513 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8514 // vector and a zero vector to clear out the zero elements.
8515 if (!isAfterLegalize && VT.isVector()) {
8516 unsigned NumMaskElts = VT.getVectorNumElements();
8517 if ((NumMaskElts % NumElems) == 0) {
8518 unsigned Scale = NumMaskElts / NumElems;
8519 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8520 for (unsigned i = 0; i < NumElems; ++i) {
8521 if (UndefMask[i])
8522 continue;
8523 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8524 for (unsigned j = 0; j != Scale; ++j)
8525 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8526 }
8527 SDValue V = CreateLoad(VT, LDBase);
8528 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8529 : DAG.getConstantFP(0.0, DL, VT);
8530 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8531 }
8532 }
8533 }
8534
8535 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8536 if (VT.is256BitVector() || VT.is512BitVector()) {
8537 unsigned HalfNumElems = NumElems / 2;
8538 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8539 EVT HalfVT =
8540 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8541 SDValue HalfLD =
8542 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8543 DAG, Subtarget, isAfterLegalize);
8544 if (HalfLD)
8545 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8546 HalfLD, DAG.getIntPtrConstant(0, DL));
8547 }
8548 }
8549
8550 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8551 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8552 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8553 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8554 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8555 : MVT::getIntegerVT(LoadSizeInBits);
8556 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8557 // Allow v4f32 on SSE1 only targets.
8558 // FIXME: Add more isel patterns so we can just use VT directly.
8559 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8560 VecVT = MVT::v4f32;
8561 if (TLI.isTypeLegal(VecVT)) {
8562 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8563 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8564 SDValue ResNode = DAG.getMemIntrinsicNode(
8565 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8566 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8567 for (auto *LD : Loads)
8568 if (LD)
8569 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8570 return DAG.getBitcast(VT, ResNode);
8571 }
8572 }
8573
8574 // BROADCAST - match the smallest possible repetition pattern, load that
8575 // scalar/subvector element and then broadcast to the entire vector.
8576 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8577 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8578 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8579 unsigned RepeatSize = SubElems * BaseSizeInBits;
8580 unsigned ScalarSize = std::min(RepeatSize, 64u);
8581 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8582 continue;
8583
8584 bool Match = true;
8585 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8586 for (unsigned i = 0; i != NumElems && Match; ++i) {
8587 if (!LoadMask[i])
8588 continue;
8589 SDValue Elt = peekThroughBitcasts(Elts[i]);
8590 if (RepeatedLoads[i % SubElems].isUndef())
8591 RepeatedLoads[i % SubElems] = Elt;
8592 else
8593 Match &= (RepeatedLoads[i % SubElems] == Elt);
8594 }
8595
8596 // We must have loads at both ends of the repetition.
8597 Match &= !RepeatedLoads.front().isUndef();
8598 Match &= !RepeatedLoads.back().isUndef();
8599 if (!Match)
8600 continue;
8601
8602 EVT RepeatVT =
8603 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8604 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8605 : EVT::getFloatingPointVT(ScalarSize);
8606 if (RepeatSize > ScalarSize)
8607 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8608 RepeatSize / ScalarSize);
8609 EVT BroadcastVT =
8610 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8611 VT.getSizeInBits() / ScalarSize);
8612 if (TLI.isTypeLegal(BroadcastVT)) {
8613 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8614 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8615 unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
8616 : X86ISD::VBROADCAST;
8617 SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
8618 return DAG.getBitcast(VT, Broadcast);
8619 }
8620 }
8621 }
8622 }
8623
8624 return SDValue();
8625}
8626
8627 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
8628// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8629// are consecutive, non-overlapping, and in the right order.
8630static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8631 SelectionDAG &DAG,
8632 const X86Subtarget &Subtarget,
8633 bool isAfterLegalize) {
8634 SmallVector<SDValue, 64> Elts;
8635 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8636 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8637 Elts.push_back(Elt);
8638 continue;
8639 }
8640 return SDValue();
8641 }
8642 assert(Elts.size() == VT.getVectorNumElements());
8643 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8644 isAfterLegalize);
8645}
8646
8647static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8648 unsigned SplatBitSize, LLVMContext &C) {
8649 unsigned ScalarSize = VT.getScalarSizeInBits();
8650 unsigned NumElm = SplatBitSize / ScalarSize;
8651
8652 SmallVector<Constant *, 32> ConstantVec;
8653 for (unsigned i = 0; i < NumElm; i++) {
8654 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8655 Constant *Const;
8656 if (VT.isFloatingPoint()) {
8657 if (ScalarSize == 32) {
8658 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8659 } else {
8660 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8661 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8662 }
8663 } else
8664 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8665 ConstantVec.push_back(Const);
8666 }
8667 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8668}
8669
8670static bool isFoldableUseOfShuffle(SDNode *N) {
8671 for (auto *U : N->uses()) {
8672 unsigned Opc = U->getOpcode();
8673 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8674 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8675 return false;
8676 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8677 return false;
8678 if (isTargetShuffle(Opc))
8679 return true;
8680 if (Opc == ISD::BITCAST) // Ignore bitcasts
8681 return isFoldableUseOfShuffle(U);
8682 if (N->hasOneUse())
8683 return true;
8684 }
8685 return false;
8686}
8687
8688/// Attempt to use the vbroadcast instruction to generate a splat value
8689/// from a splat BUILD_VECTOR which uses:
8690/// a. A single scalar load, or a constant.
8691/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8692///
8693/// The VBROADCAST node is returned when a pattern is found,
8694/// or SDValue() otherwise.
8695static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8696 const X86Subtarget &Subtarget,
8697 SelectionDAG &DAG) {
8698 // VBROADCAST requires AVX.
8699 // TODO: Splats could be generated for non-AVX CPUs using SSE
8700 // instructions, but there's less potential gain for only 128-bit vectors.
8701 if (!Subtarget.hasAVX())
8702 return SDValue();
8703
8704 MVT VT = BVOp->getSimpleValueType(0);
8705 unsigned NumElts = VT.getVectorNumElements();
8706 SDLoc dl(BVOp);
8707
8708 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8709 "Unsupported vector type for broadcast.");
8710
8711 // See if the build vector is a repeating sequence of scalars (inc. splat).
8712 SDValue Ld;
8713 BitVector UndefElements;
8714 SmallVector<SDValue, 16> Sequence;
8715 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8716 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8717 if (Sequence.size() == 1)
8718 Ld = Sequence[0];
8719 }
8720
8721 // Attempt to use VBROADCASTM
8722 // From this pattern:
8723 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8724 // b. t1 = (build_vector t0 t0)
8725 //
8726 // Create (VBROADCASTM v2i1 X)
8727 if (!Sequence.empty() && Subtarget.hasCDI()) {
8728 // If not a splat, are the upper sequence values zeroable?
8729 unsigned SeqLen = Sequence.size();
8730 bool UpperZeroOrUndef =
8731 SeqLen == 1 ||
8732 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8733 return !V || V.isUndef() || isNullConstant(V);
8734 });
8735 SDValue Op0 = Sequence[0];
8736 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8737 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8738 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8739 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8740 ? Op0.getOperand(0)
8741 : Op0.getOperand(0).getOperand(0);
8742 MVT MaskVT = BOperand.getSimpleValueType();
8743 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8744 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8745 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8746 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8747 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8748 unsigned Scale = 512 / VT.getSizeInBits();
8749 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8750 }
8751 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8752 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8753 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8754 return DAG.getBitcast(VT, Bcst);
8755 }
8756 }
8757 }
8758
8759 unsigned NumUndefElts = UndefElements.count();
8760 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8761 APInt SplatValue, Undef;
8762 unsigned SplatBitSize;
8763 bool HasUndef;
8764 // Check if this is a repeated constant pattern suitable for broadcasting.
8765 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8766 SplatBitSize > VT.getScalarSizeInBits() &&
8767 SplatBitSize < VT.getSizeInBits()) {
8768 // Avoid replacing with broadcast when it's a use of a shuffle
8769 // instruction to preserve the present custom lowering of shuffles.
8770 if (isFoldableUseOfShuffle(BVOp))
8771 return SDValue();
8772 // replace BUILD_VECTOR with broadcast of the repeated constants.
8773 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8774 LLVMContext *Ctx = DAG.getContext();
8775 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8776 if (Subtarget.hasAVX()) {
8777 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8778 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8779 // Splatted value can fit in one INTEGER constant in constant pool.
8780 // Load the constant and broadcast it.
8781 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8782 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8783 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8784 SDValue CP = DAG.getConstantPool(C, PVT);
8785 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8786
8787 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8788 SDVTList Tys =
8789 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8790 SDValue Ops[] = {DAG.getEntryNode(), CP};
8791 MachinePointerInfo MPI =
8792 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8793 SDValue Brdcst = DAG.getMemIntrinsicNode(
8794 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8795 MachineMemOperand::MOLoad);
8796 return DAG.getBitcast(VT, Brdcst);
8797 }
8798 if (SplatBitSize > 64) {
8799 // Load the vector of constants and broadcast it.
8800 MVT CVT = VT.getScalarType();
8801 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8802 *Ctx);
8803 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8804 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8805 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8806 Ld = DAG.getLoad(
8807 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
8808 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8809 Alignment);
8810 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
8811 }
8812 }
8813 }
8814
8815 // If we are moving a scalar into a vector (Ld must be set and all elements
8816 // but 1 are undef) and that operation is not obviously supported by
8817 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8818 // That's better than general shuffling and may eliminate a load to GPR and
8819 // move from scalar to vector register.
8820 if (!Ld || NumElts - NumUndefElts != 1)
8821 return SDValue();
8822 unsigned ScalarSize = Ld.getValueSizeInBits();
8823 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8824 return SDValue();
8825 }
8826
8827 bool ConstSplatVal =
8828 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8829 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8830
8831 // TODO: Handle broadcasts of non-constant sequences.
8832
8833 // Make sure that all of the users of a non-constant load are from the
8834 // BUILD_VECTOR node.
8835 // FIXME: Is the use count needed for non-constant, non-load case?
8836 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8837 return SDValue();
8838
8839 unsigned ScalarSize = Ld.getValueSizeInBits();
8840 bool IsGE256 = (VT.getSizeInBits() >= 256);
8841
8842 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8843 // instruction to save 8 or more bytes of constant pool data.
8844 // TODO: If multiple splats are generated to load the same constant,
8845 // it may be detrimental to overall size. There needs to be a way to detect
8846 // that condition to know if this is truly a size win.
8847 bool OptForSize = DAG.shouldOptForSize();
8848
8849 // Handle broadcasting a single constant scalar from the constant pool
8850 // into a vector.
8851 // On Sandybridge (no AVX2), it is still better to load a constant vector
8852 // from the constant pool and not to broadcast it from a scalar.
8853 // But override that restriction when optimizing for size.
8854 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8855 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8856 EVT CVT = Ld.getValueType();
8857 assert(!CVT.isVector() && "Must not broadcast a vector type");
8858
8859 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8860 // For size optimization, also splat v2f64 and v2i64, and for size opt
8861 // with AVX2, also splat i8 and i16.
8862 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8863 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8864 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8865 const Constant *C = nullptr;
8866 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8867 C = CI->getConstantIntValue();
8868 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8869 C = CF->getConstantFPValue();
8870
8871 assert(C && "Invalid constant type");
8872
8873 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8874 SDValue CP =
8875 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
8876 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8877
8878 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8879 SDValue Ops[] = {DAG.getEntryNode(), CP};
8880 MachinePointerInfo MPI =
8881 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8882 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8883 MPI, Alignment, MachineMemOperand::MOLoad);
8884 }
8885 }
8886
8887 // Handle AVX2 in-register broadcasts.
8888 if (!IsLoad && Subtarget.hasInt256() &&
8889 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8890 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8891
8892 // The scalar source must be a normal load.
8893 if (!IsLoad)
8894 return SDValue();
8895
8896 // Make sure the non-chain result is only used by this build vector.
8897 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8898 return SDValue();
8899
8900 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8901 (Subtarget.hasVLX() && ScalarSize == 64)) {
8902 auto *LN = cast<LoadSDNode>(Ld);
8903 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8904 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8905 SDValue BCast =
8906 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8907 LN->getMemoryVT(), LN->getMemOperand());
8908 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8909 return BCast;
8910 }
8911
8912 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
8913 // match double, since there is no vbroadcastsd xmm instruction.
8914 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8915 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8916 auto *LN = cast<LoadSDNode>(Ld);
8917 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8918 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8919 SDValue BCast =
8920 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8921 LN->getMemoryVT(), LN->getMemOperand());
8922 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8923 return BCast;
8924 }
8925
8926 // Unsupported broadcast.
8927 return SDValue();
8928}
8929
8930/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8931/// underlying vector and index.
8932///
8933/// Modifies \p ExtractedFromVec to the real vector and returns the real
8934/// index.
8935static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8936 SDValue ExtIdx) {
8937 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
8938 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8939 return Idx;
8940
8941 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8942 // lowered this:
8943 // (extract_vector_elt (v8f32 %1), Constant<6>)
8944 // to:
8945 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8946 // (extract_subvector (v8f32 %0), Constant<4>),
8947 // undef)
8948 // Constant<0>)
8949 // In this case the vector is the extract_subvector expression and the index
8950 // is 2, as specified by the shuffle.
8951 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8952 SDValue ShuffleVec = SVOp->getOperand(0);
8953 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8954 assert(ShuffleVecVT.getVectorElementType() ==
8955 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8956
8957 int ShuffleIdx = SVOp->getMaskElt(Idx);
8958 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8959 ExtractedFromVec = ShuffleVec;
8960 return ShuffleIdx;
8961 }
8962 return Idx;
8963}
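As a standalone illustration of the index remapping performed by getUnderlyingExtractedFromVec, the sketch below models the same decision on a plain shuffle-mask array. It is not part of X86ISelLowering.cpp and uses no SelectionDAG types; the helper name and the -1-as-undef convention are assumptions made only for this example.

#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical standalone helper: given a shuffle mask and an extract index
// into the shuffle's result, return the index into the shuffle's first
// operand, or the original index if the mask element cannot be forwarded.
// A value of -1 models an undef mask element.
static int remapExtractIndex(const std::vector<int> &ShuffleMask, int Idx,
                             int NumSrcElts) {
  assert(Idx >= 0 && Idx < (int)ShuffleMask.size() && "index out of range");
  int MaskElt = ShuffleMask[Idx];
  // Mirrors isUndefOrInRange(MaskElt, 0, NumSrcElts): undef or a lane of the
  // first shuffle operand can be forwarded directly.
  if (MaskElt == -1 || (MaskElt >= 0 && MaskElt < NumSrcElts))
    return MaskElt;
  return Idx;
}

int main() {
  // Matches the example in the comment above: extracting lane 0 of
  // shuffle<2,u,u,u> really reads lane 2 of the shuffle's first operand.
  std::vector<int> Mask = {2, -1, -1, -1};
  std::printf("remapped index = %d\n", remapExtractIndex(Mask, 0, 4)); // prints 2
  return 0;
}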
8964
8965static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
8966 MVT VT = Op.getSimpleValueType();
8967
8968 // Skip if insert_vec_elt is not supported.
8969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8970 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
8971 return SDValue();
8972
8973 SDLoc DL(Op);
8974 unsigned NumElems = Op.getNumOperands();
8975
8976 SDValue VecIn1;
8977 SDValue VecIn2;
8978 SmallVector<unsigned, 4> InsertIndices;
8979 SmallVector<int, 8> Mask(NumElems, -1);
8980
8981 for (unsigned i = 0; i != NumElems; ++i) {
8982 unsigned Opc = Op.getOperand(i).getOpcode();
8983
8984 if (Opc == ISD::UNDEF)
8985 continue;
8986
8987 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
8988       // Quit if more than 1 element needs inserting.
8989 if (InsertIndices.size() > 1)
8990 return SDValue();
8991
8992 InsertIndices.push_back(i);
8993 continue;
8994 }
8995
8996 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8997 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8998
8999 // Quit if non-constant index.
9000 if (!isa<ConstantSDNode>(ExtIdx))
9001 return SDValue();
9002 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9003
9004 // Quit if extracted from vector of different type.
9005 if (ExtractedFromVec.getValueType() != VT)
9006 return SDValue();
9007
9008 if (!VecIn1.getNode())
9009 VecIn1 = ExtractedFromVec;
9010 else if (VecIn1 != ExtractedFromVec) {
9011 if (!VecIn2.getNode())
9012 VecIn2 = ExtractedFromVec;
9013 else if (VecIn2 != ExtractedFromVec)
9014 // Quit if more than 2 vectors to shuffle
9015 return SDValue();
9016 }
9017
9018 if (ExtractedFromVec == VecIn1)
9019 Mask[i] = Idx;
9020 else if (ExtractedFromVec == VecIn2)
9021 Mask[i] = Idx + NumElems;
9022 }
9023
9024 if (!VecIn1.getNode())
9025 return SDValue();
9026
9027 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9028 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9029
9030 for (unsigned Idx : InsertIndices)
9031 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9032 DAG.getIntPtrConstant(Idx, DL));
9033
9034 return NV;
9035}
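The mask bookkeeping in buildFromShuffleMostly can be modeled without SelectionDAG types. The sketch below is a hypothetical standalone version: each element descriptor says which source vector and lane it comes from, or that it is a loose scalar to be inserted afterwards; lanes from the second vector are offset by NumElems, matching the two-input shuffle mask convention used above.

#include <cstdio>
#include <vector>

// Vec is 0 or 1 for an extract from one of the two source vectors, or -1 for
// a loose scalar that must be re-inserted after the shuffle.
struct EltSrc { int Vec; int Lane; };

static bool buildMask(const std::vector<EltSrc> &Elts, std::vector<int> &Mask,
                      std::vector<unsigned> &InsertIndices) {
  unsigned NumElems = Elts.size();
  Mask.assign(NumElems, -1); // undef mask slots stay at -1
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Elts[i].Vec < 0) {
      // Mirrors the check above: give up once more than one element already
      // needs inserting.
      if (InsertIndices.size() > 1)
        return false;
      InsertIndices.push_back(i);
      continue;
    }
    // Second-vector lanes are shifted by NumElems in a two-input shuffle mask.
    Mask[i] = Elts[i].Lane + (Elts[i].Vec == 1 ? (int)NumElems : 0);
  }
  return true;
}

int main() {
  std::vector<EltSrc> Elts = {{0, 3}, {1, 0}, {-1, 0}, {0, 1}};
  std::vector<int> Mask;
  std::vector<unsigned> Ins;
  if (buildMask(Elts, Mask, Ins))
    std::printf("mask = {%d,%d,%d,%d}, insert at %u\n", Mask[0], Mask[1],
                Mask[2], Mask[3], Ins[0]); // mask = {3,4,-1,1}, insert at 2
  return 0;
}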
9036
9037// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9038static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9039 const X86Subtarget &Subtarget) {
9040
9041 MVT VT = Op.getSimpleValueType();
9042   assert((VT.getVectorElementType() == MVT::i1) &&
9043          "Unexpected type in LowerBUILD_VECTORvXi1!");
9044
9045 SDLoc dl(Op);
9046 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9047 ISD::isBuildVectorAllOnes(Op.getNode()))
9048 return Op;
9049
9050 uint64_t Immediate = 0;
9051 SmallVector<unsigned, 16> NonConstIdx;
9052 bool IsSplat = true;
9053 bool HasConstElts = false;
9054 int SplatIdx = -1;
9055 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9056 SDValue In = Op.getOperand(idx);
9057 if (In.isUndef())
9058 continue;
9059 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9060 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9061 HasConstElts = true;
9062 } else {
9063 NonConstIdx.push_back(idx);
9064 }
9065 if (SplatIdx < 0)
9066 SplatIdx = idx;
9067 else if (In != Op.getOperand(SplatIdx))
9068 IsSplat = false;
9069 }
9070
9071   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
9072 if (IsSplat) {
9073 // The build_vector allows the scalar element to be larger than the vector
9074 // element type. We need to mask it to use as a condition unless we know
9075 // the upper bits are zero.
9076 // FIXME: Use computeKnownBits instead of checking specific opcode?
9077 SDValue Cond = Op.getOperand(SplatIdx);
9078     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9079 if (Cond.getOpcode() != ISD::SETCC)
9080 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9081 DAG.getConstant(1, dl, MVT::i8));
9082
9083 // Perform the select in the scalar domain so we can use cmov.
9084 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9085 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9086 DAG.getAllOnesConstant(dl, MVT::i32),
9087 DAG.getConstant(0, dl, MVT::i32));
9088 Select = DAG.getBitcast(MVT::v32i1, Select);
9089 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9090 } else {
9091 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9092 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9093 DAG.getAllOnesConstant(dl, ImmVT),
9094 DAG.getConstant(0, dl, ImmVT));
9095 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9096 Select = DAG.getBitcast(VecVT, Select);
9097 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9098 DAG.getIntPtrConstant(0, dl));
9099 }
9100 }
9101
9102 // insert elements one by one
9103 SDValue DstVec;
9104 if (HasConstElts) {
9105 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9106 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9107 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9108 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9109 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9110 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9111 } else {
9112 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9113 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9114 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9115 DstVec = DAG.getBitcast(VecVT, Imm);
9116 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9117 DAG.getIntPtrConstant(0, dl));
9118 }
9119 } else
9120 DstVec = DAG.getUNDEF(VT);
9121
9122 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9123 unsigned InsertIdx = NonConstIdx[i];
9124 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9125 Op.getOperand(InsertIdx),
9126 DAG.getIntPtrConstant(InsertIdx, dl));
9127 }
9128 return DstVec;
9129}
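The constant path of LowerBUILD_VECTORvXi1 boils down to packing the known i1 elements into an integer immediate, one bit per lane, and leaving the remaining lanes to later insert_vector_elt nodes. A minimal standalone sketch of that packing (plain C++, not DAG code; -1 marks an undef or non-constant lane, and the helper name is hypothetical):

#include <cstdint>
#include <cstdio>
#include <vector>

// Each constant element contributes its low bit at position Idx; -1 lanes
// contribute nothing and would be patched in later with insert_vector_elt.
static uint64_t packMaskImmediate(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] >= 0)
      Immediate |= (uint64_t)(Elts[Idx] & 0x1) << Idx;
  return Immediate;
}

int main() {
  // v8i1 <1, 0, 1, 1, undef, 0, 0, 1>  ->  bits 0, 2, 3, 7 set  ->  0x8d
  std::vector<int> Elts = {1, 0, 1, 1, -1, 0, 0, 1};
  std::printf("immediate = 0x%llx\n",
              (unsigned long long)packMaskImmediate(Elts));
  return 0;
}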
9130
9131/// This is a helper function of LowerToHorizontalOp().
9132/// This function checks that the input build_vector \p N implements a
9133/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9134/// may not match the layout of an x86 256-bit horizontal instruction.
9135/// In other words, if this returns true, then some extraction/insertion will
9136/// be required to produce a valid horizontal instruction.
9137///
9138/// Parameter \p Opcode defines the kind of horizontal operation to match.
9139/// For example, if \p Opcode is equal to ISD::ADD, then this function
9140/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9141/// is equal to ISD::SUB, then this function checks if this is a horizontal
9142/// arithmetic sub.
9143///
9144/// This function only analyzes elements of \p N whose indices are
9145/// in range [BaseIdx, LastIdx).
9146///
9147/// TODO: This function was originally used to match both real and fake partial
9148/// horizontal operations, but the index-matching logic is incorrect for that.
9149/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9150/// code because it is only used for partial h-op matching now?
9151static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9152 SelectionDAG &DAG,
9153 unsigned BaseIdx, unsigned LastIdx,
9154 SDValue &V0, SDValue &V1) {
9155 EVT VT = N->getValueType(0);
9156   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9157   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9158   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9159          "Invalid Vector in input!");
9160
9161 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9162 bool CanFold = true;
9163 unsigned ExpectedVExtractIdx = BaseIdx;
9164 unsigned NumElts = LastIdx - BaseIdx;
9165 V0 = DAG.getUNDEF(VT);
9166 V1 = DAG.getUNDEF(VT);
9167
9168 // Check if N implements a horizontal binop.
9169 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9170 SDValue Op = N->getOperand(i + BaseIdx);
9171
9172 // Skip UNDEFs.
9173 if (Op->isUndef()) {
9174 // Update the expected vector extract index.
9175 if (i * 2 == NumElts)
9176 ExpectedVExtractIdx = BaseIdx;
9177 ExpectedVExtractIdx += 2;
9178 continue;
9179 }
9180
9181 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9182
9183 if (!CanFold)
9184 break;
9185
9186 SDValue Op0 = Op.getOperand(0);
9187 SDValue Op1 = Op.getOperand(1);
9188
9189 // Try to match the following pattern:
9190 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9191 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9192 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9193 Op0.getOperand(0) == Op1.getOperand(0) &&
9194 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9195 isa<ConstantSDNode>(Op1.getOperand(1)));
9196 if (!CanFold)
9197 break;
9198
9199 unsigned I0 = Op0.getConstantOperandVal(1);
9200 unsigned I1 = Op1.getConstantOperandVal(1);
9201
9202 if (i * 2 < NumElts) {
9203 if (V0.isUndef()) {
9204 V0 = Op0.getOperand(0);
9205 if (V0.getValueType() != VT)
9206 return false;
9207 }
9208 } else {
9209 if (V1.isUndef()) {
9210 V1 = Op0.getOperand(0);
9211 if (V1.getValueType() != VT)
9212 return false;
9213 }
9214 if (i * 2 == NumElts)
9215 ExpectedVExtractIdx = BaseIdx;
9216 }
9217
9218 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9219 if (I0 == ExpectedVExtractIdx)
9220 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9221 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9222 // Try to match the following dag sequence:
9223 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9224 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9225 } else
9226 CanFold = false;
9227
9228 ExpectedVExtractIdx += 2;
9229 }
9230
9231 return CanFold;
9232}
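The core of isHorizontalBinOpPart is the walk of expected extract indices: output element i of one half must read source lanes (BaseIdx + 2*i, BaseIdx + 2*i + 1), with the swapped pair accepted only for commutable opcodes. The sketch below is a simplified standalone version of that walk over a single half (plain C++, ignoring the V0/V1 split, the opcode checks, and undef handling; the helper name is hypothetical):

#include <cstdio>
#include <utility>
#include <vector>

// ExtractPairs[i] holds the constant extract indices (I0, I1) feeding output
// element i. The expected counter advances by 2 per element.
static bool matchesHorizontalPattern(
    const std::vector<std::pair<unsigned, unsigned>> &ExtractPairs,
    unsigned BaseIdx, bool IsCommutable) {
  unsigned Expected = BaseIdx;
  for (unsigned i = 0; i != ExtractPairs.size(); ++i) {
    unsigned I0 = ExtractPairs[i].first, I1 = ExtractPairs[i].second;
    bool Direct = (I0 == Expected && I1 == I0 + 1);
    bool Swapped = IsCommutable && (I1 == Expected && I0 == I1 + 1);
    if (!Direct && !Swapped)
      return false;
    Expected += 2;
  }
  return true;
}

int main() {
  // Lower half of a horizontal add: element i = A[2i] + A[2i+1]. The third
  // pair is swapped, which is still acceptable for a commutable ADD.
  std::vector<std::pair<unsigned, unsigned>> Pairs = {{0, 1}, {2, 3}, {5, 4}, {6, 7}};
  std::printf("matches: %d\n", (int)matchesHorizontalPattern(Pairs, 0, true));
  return 0;
}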
9233
9234/// Emit a sequence of two 128-bit horizontal add/sub followed by
9235/// a concat_vector.
9236///
9237/// This is a helper function of LowerToHorizontalOp().
9238/// This function expects two 256-bit vectors called V0 and V1.
9239/// At first, each vector is split into two separate 128-bit vectors.
9240/// Then, the resulting 128-bit vectors are used to implement two
9241/// horizontal binary operations.
9242///
9243/// The kind of horizontal binary operation is defined by \p X86Opcode.
9244///
9245/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
9246/// the two new horizontal binops.
9247/// When Mode is set, the first horizontal binop dag node would take as input
9248/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9249/// horizontal binop dag node would take as input the lower 128-bit of V1
9250/// and the upper 128-bit of V1.
9251/// Example:
9252/// HADD V0_LO, V0_HI
9253/// HADD V1_LO, V1_HI
9254///
9255/// Otherwise, the first horizontal binop dag node takes as input the lower
9256/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9257/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9258/// Example:
9259/// HADD V0_LO, V1_LO
9260/// HADD V0_HI, V1_HI
9261///
9262/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9263/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9264/// the upper 128-bits of the result.
9265static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9266 const SDLoc &DL, SelectionDAG &DAG,
9267 unsigned X86Opcode, bool Mode,
9268 bool isUndefLO, bool isUndefHI) {
9269 MVT VT = V0.getSimpleValueType();
9270   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9271          "Invalid nodes in input!");
9272
9273 unsigned NumElts = VT.getVectorNumElements();
9274 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9275 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9276 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9277 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9278 MVT NewVT = V0_LO.getSimpleValueType();
9279
9280 SDValue LO = DAG.getUNDEF(NewVT);
9281 SDValue HI = DAG.getUNDEF(NewVT);
9282
9283 if (Mode) {
9284 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9285 if (!isUndefLO && !V0->isUndef())
9286 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9287 if (!isUndefHI && !V1->isUndef())
9288 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9289 } else {
9290 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9291 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9292 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9293
9294 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9295 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9296 }
9297
9298 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9299}
9300
9301/// Returns true iff \p BV builds a vector with the result equivalent to
9302/// the result of ADDSUB/SUBADD operation.
9303/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9304/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9305/// \p Opnd0 and \p Opnd1.
9306static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9307 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9308 SDValue &Opnd0, SDValue &Opnd1,
9309 unsigned &NumExtracts,
9310 bool &IsSubAdd) {
9311
9312 MVT VT = BV->getSimpleValueType(0);
9313 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9314 return false;
9315
9316 unsigned NumElts = VT.getVectorNumElements();
9317 SDValue InVec0 = DAG.getUNDEF(VT);
9318 SDValue InVec1 = DAG.getUNDEF(VT);
9319
9320 NumExtracts = 0;
9321
9322 // Odd-numbered elements in the input build vector are obtained from
9323 // adding/subtracting two integer/float elements.
9324 // Even-numbered elements in the input build vector are obtained from
9325 // subtracting/adding two integer/float elements.
9326 unsigned Opc[2] = {0, 0};
9327 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9328 SDValue Op = BV->getOperand(i);
9329
9330 // Skip 'undef' values.
9331 unsigned Opcode = Op.getOpcode();
9332 if (Opcode == ISD::UNDEF)
9333 continue;
9334
9335 // Early exit if we found an unexpected opcode.
9336 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9337 return false;
9338
9339 SDValue Op0 = Op.getOperand(0);
9340 SDValue Op1 = Op.getOperand(1);
9341
9342 // Try to match the following pattern:
9343 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9344 // Early exit if we cannot match that sequence.
9345 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9346 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9347 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9348 Op0.getOperand(1) != Op1.getOperand(1))
9349 return false;
9350
9351 unsigned I0 = Op0.getConstantOperandVal(1);
9352 if (I0 != i)
9353 return false;
9354
9355     // We found a valid add/sub node; make sure it's the same opcode as previous
9356     // elements for this parity.
9357 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9358 return false;
9359 Opc[i % 2] = Opcode;
9360
9361 // Update InVec0 and InVec1.
9362 if (InVec0.isUndef()) {
9363 InVec0 = Op0.getOperand(0);
9364 if (InVec0.getSimpleValueType() != VT)
9365 return false;
9366 }
9367 if (InVec1.isUndef()) {
9368 InVec1 = Op1.getOperand(0);
9369 if (InVec1.getSimpleValueType() != VT)
9370 return false;
9371 }
9372
9373     // Make sure that the operands of each add/sub node always
9374     // come from the same pair of vectors.
9375 if (InVec0 != Op0.getOperand(0)) {
9376 if (Opcode == ISD::FSUB)
9377 return false;
9378
9379 // FADD is commutable. Try to commute the operands
9380 // and then test again.
9381 std::swap(Op0, Op1);
9382 if (InVec0 != Op0.getOperand(0))
9383 return false;
9384 }
9385
9386 if (InVec1 != Op1.getOperand(0))
9387 return false;
9388
9389 // Increment the number of extractions done.
9390 ++NumExtracts;
9391 }
9392
9393 // Ensure we have found an opcode for both parities and that they are
9394 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9395 // inputs are undef.
9396 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9397 InVec0.isUndef() || InVec1.isUndef())
9398 return false;
9399
9400 IsSubAdd = Opc[0] == ISD::FADD;
9401
9402 Opnd0 = InVec0;
9403 Opnd1 = InVec1;
9404 return true;
9405}
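The parity bookkeeping in isAddSubOrSubAdd reduces to: record the opcode seen at even positions and at odd positions, reject any mismatch within a parity, and require that the two parities end up filled with different opcodes. A standalone sketch of just that bookkeeping (plain C++, not DAG code; 'A'/'S' stand in for ISD::FADD/ISD::FSUB and 0 for an undef lane, and the helper name is hypothetical):

#include <cstdio>
#include <vector>

static bool classifyAddSub(const std::vector<char> &EltOpcodes, bool &IsSubAdd) {
  char Opc[2] = {0, 0}; // opcode seen at even / odd positions
  for (unsigned i = 0; i < EltOpcodes.size(); ++i) {
    char Opcode = EltOpcodes[i];
    if (Opcode == 0)
      continue; // skip undef elements
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false; // opcode mismatch within this parity
    Opc[i % 2] = Opcode;
  }
  // Both parities must be seen and must differ; otherwise there is nothing
  // to fold (e.g. all-add or all-undef-on-one-parity build_vectors).
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1])
    return false;
  IsSubAdd = (Opc[0] == 'A'); // even lanes add, odd lanes subtract => SUBADD
  return true;
}

int main() {
  bool IsSubAdd = false;
  std::vector<char> AddSub = {'S', 'A', 'S', 'A'}; // classic addsubps pattern
  bool Match = classifyAddSub(AddSub, IsSubAdd);
  std::printf("match=%d subadd=%d\n", (int)Match, (int)IsSubAdd); // 1, 0
  return 0;
}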
9406
9407/// Returns true if it is possible to fold MUL and an idiom that has already been
9408/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9409/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9410/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9411///
9412/// Prior to calling this function it should be known that there is some
9413/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9414/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9415/// before replacement of such SDNode with ADDSUB operation. Thus the number
9416/// of \p Opnd0 uses is expected to be equal to 2.
9417/// For example, this function may be called for the following IR:
9418/// %AB = fmul fast <2 x double> %A, %B
9419/// %Sub = fsub fast <2 x double> %AB, %C
9420/// %Add = fadd fast <2 x double> %AB, %C
9421/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9422/// <2 x i32> <i32 0, i32 3>
9423/// There is a def for %Addsub here, which potentially can be replaced by
9424/// X86ISD::ADDSUB operation:
9425/// %Addsub = X86ISD::ADDSUB %AB, %C
9426/// and such ADDSUB can further be replaced with FMADDSUB:
9427/// %Addsub = FMADDSUB %A, %B, %C.
9428///
9429/// The main reason why this method is called before the replacement of the
9430/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9431/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9432/// FMADDSUB is.
9433static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9434 SelectionDAG &DAG,
9435 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9436 unsigned ExpectedUses) {
9437 if (Opnd0.getOpcode() != ISD::FMUL ||
9438 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9439 return false;
9440
9441 // FIXME: These checks must match the similar ones in
9442 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9443 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9444 // or MUL + ADDSUB to FMADDSUB.
9445 const TargetOptions &Options = DAG.getTarget().Options;
9446 bool AllowFusion =
9447 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9448 if (!AllowFusion)
9449 return false;
9450
9451 Opnd2 = Opnd1;
9452 Opnd1 = Opnd0.getOperand(1);
9453 Opnd0 = Opnd0.getOperand(0);
9454
9455 return true;
9456}
9457
9458/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
9459/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
9460/// X86ISD::FMSUBADD node.
9461static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9462 const X86Subtarget &Subtarget,
9463 SelectionDAG &DAG) {
9464 SDValue Opnd0, Opnd1;
9465 unsigned NumExtracts;
9466 bool IsSubAdd;
9467 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9468 IsSubAdd))
9469 return SDValue();
9470
9471 MVT VT = BV->getSimpleValueType(0);
9472 SDLoc DL(BV);
9473
9474 // Try to generate X86ISD::FMADDSUB node here.
9475 SDValue Opnd2;
9476 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9477 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9478 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9479 }
9480
9481 // We only support ADDSUB.
9482 if (IsSubAdd)
9483 return SDValue();
9484
9485 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9486 // the ADDSUB idiom has been successfully recognized. There are no known
9487 // X86 targets with 512-bit ADDSUB instructions!
9488 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9489 // recognition.
9490 if (VT.is512BitVector())
9491 return SDValue();
9492
9493 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9494}
9495
9496static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9497 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9498 // Initialize outputs to known values.
9499 MVT VT = BV->getSimpleValueType(0);
9500 HOpcode = ISD::DELETED_NODE;
9501 V0 = DAG.getUNDEF(VT);
9502 V1 = DAG.getUNDEF(VT);
9503
9504 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9505 // half of the result is calculated independently from the 128-bit halves of
9506 // the inputs, so that makes the index-checking logic below more complicated.
9507 unsigned NumElts = VT.getVectorNumElements();
9508 unsigned GenericOpcode = ISD::DELETED_NODE;
9509 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9510 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9511 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9512 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9513 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9514 // Ignore undef elements.
9515 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9516 if (Op.isUndef())
9517 continue;
9518
9519 // If there's an opcode mismatch, we're done.
9520 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9521 return false;
9522
9523 // Initialize horizontal opcode.
9524 if (HOpcode == ISD::DELETED_NODE) {
9525 GenericOpcode = Op.getOpcode();
9526 switch (GenericOpcode) {
9527 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9528 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9529 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9530 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9531 default: return false;
9532 }
9533 }
9534
9535 SDValue Op0 = Op.getOperand(0);
9536 SDValue Op1 = Op.getOperand(1);
9537 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9538 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9539 Op0.getOperand(0) != Op1.getOperand(0) ||
9540 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9541 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9542 return false;
9543
9544 // The source vector is chosen based on which 64-bit half of the
9545 // destination vector is being calculated.
9546 if (j < NumEltsIn64Bits) {
9547 if (V0.isUndef())
9548 V0 = Op0.getOperand(0);
9549 } else {
9550 if (V1.isUndef())
9551 V1 = Op0.getOperand(0);
9552 }
9553
9554 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9555 if (SourceVec != Op0.getOperand(0))
9556 return false;
9557
9558 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9559 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9560 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9561 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9562 (j % NumEltsIn64Bits) * 2;
9563 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9564 continue;
9565
9566 // If this is not a commutative op, this does not match.
9567 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9568 return false;
9569
9570 // Addition is commutative, so try swapping the extract indexes.
9571 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9572 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9573 continue;
9574
9575 // Extract indexes do not match horizontal requirement.
9576 return false;
9577 }
9578 }
9579 // We matched. Opcode and operands are returned by reference as arguments.
9580 return true;
9581}
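The non-obvious part of isHopBuildVector is the expected-index formula ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2 together with the j-based choice of V0 or V1. The standalone sketch below simply prints the lane pairing this implies for a 256-bit integer hop such as v8i32, which matches the per-128-bit-lane behaviour of the AVX2 horizontal instructions (this is an illustration, not part of the file):

#include <cstdio>

int main() {
  // v8i32: two 128-bit chunks of four elements, two elements per 64-bit half.
  const unsigned Num128BitChunks = 2;
  const unsigned NumEltsIn128Bits = 4;
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      std::printf("result[%u] = %s[%u] op %s[%u]\n",
                  i * NumEltsIn128Bits + j, Src, ExpectedIndex, Src,
                  ExpectedIndex + 1);
    }
  return 0;
}

Running this prints result[0..3] built from lanes 0-3 of V0 and V1 and result[4..7] built from lanes 4-7, i.e. each 128-bit half of the output only reads the matching 128-bit half of the inputs.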
9582
9583static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9584 SelectionDAG &DAG, unsigned HOpcode,
9585 SDValue V0, SDValue V1) {
9586 // If either input vector is not the same size as the build vector,
9587 // extract/insert the low bits to the correct size.
9588 // This is free (examples: zmm --> xmm, xmm --> ymm).
9589 MVT VT = BV->getSimpleValueType(0);
9590 unsigned Width = VT.getSizeInBits();
9591 if (V0.getValueSizeInBits() > Width)
9592 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9593 else if (V0.getValueSizeInBits() < Width)
9594 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9595
9596 if (V1.getValueSizeInBits() > Width)
9597 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9598 else if (V1.getValueSizeInBits() < Width)
9599 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9600
9601 unsigned NumElts = VT.getVectorNumElements();
9602 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9603 for (unsigned i = 0; i != NumElts; ++i)
9604 if (BV->getOperand(i).isUndef())
9605 DemandedElts.clearBit(i);
9606
9607 // If we don't need the upper xmm, then perform as a xmm hop.
9608 unsigned HalfNumElts = NumElts / 2;
9609 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9610 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9611 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9612 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9613 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9614 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9615 }
9616
9617 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9618}
9619
9620/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9621static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9622 const X86Subtarget &Subtarget,
9623 SelectionDAG &DAG) {
9624 // We need at least 2 non-undef elements to make this worthwhile by default.
9625 unsigned NumNonUndefs =
9626 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9627 if (NumNonUndefs < 2)
9628 return SDValue();
9629
9630 // There are 4 sets of horizontal math operations distinguished by type:
9631 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9632 // subtarget feature. Try to match those "native" patterns first.
9633 MVT VT = BV->getSimpleValueType(0);
9634 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9635 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9636 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9637 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9638 unsigned HOpcode;
9639 SDValue V0, V1;
9640 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9641 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9642 }
9643
9644 // Try harder to match 256-bit ops by using extract/concat.
9645 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9646 return SDValue();
9647
9648   // Count the number of UNDEF operands in the input build_vector.
9649 unsigned NumElts = VT.getVectorNumElements();
9650 unsigned Half = NumElts / 2;
9651 unsigned NumUndefsLO = 0;
9652 unsigned NumUndefsHI = 0;
9653 for (unsigned i = 0, e = Half; i != e; ++i)
9654 if (BV->getOperand(i)->isUndef())
9655 NumUndefsLO++;
9656
9657 for (unsigned i = Half, e = NumElts; i != e; ++i)
9658 if (BV->getOperand(i)->isUndef())
9659 NumUndefsHI++;
9660
9661 SDLoc DL(BV);
9662 SDValue InVec0, InVec1;
9663 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9664 SDValue InVec2, InVec3;
9665 unsigned X86Opcode;
9666 bool CanFold = true;
9667
9668 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9669 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9670 InVec3) &&
9671 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9672 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9673 X86Opcode = X86ISD::HADD;
9674 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9675 InVec1) &&
9676 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9677 InVec3) &&
9678 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9679 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9680 X86Opcode = X86ISD::HSUB;
9681 else
9682 CanFold = false;
9683
9684 if (CanFold) {
9685 // Do not try to expand this build_vector into a pair of horizontal
9686 // add/sub if we can emit a pair of scalar add/sub.
9687 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9688 return SDValue();
9689
9690 // Convert this build_vector into a pair of horizontal binops followed by
9691 // a concat vector. We must adjust the outputs from the partial horizontal
9692 // matching calls above to account for undefined vector halves.
9693 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9694 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9695       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9696 bool isUndefLO = NumUndefsLO == Half;
9697 bool isUndefHI = NumUndefsHI == Half;
9698 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9699 isUndefHI);
9700 }
9701 }
9702
9703 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9704 VT == MVT::v16i16) {
9705 unsigned X86Opcode;
9706 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9707 X86Opcode = X86ISD::HADD;
9708 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9709 InVec1))
9710 X86Opcode = X86ISD::HSUB;
9711 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9712 InVec1))
9713 X86Opcode = X86ISD::FHADD;
9714 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9715 InVec1))
9716 X86Opcode = X86ISD::FHSUB;
9717 else
9718 return SDValue();
9719
9720 // Don't try to expand this build_vector into a pair of horizontal add/sub
9721 // if we can simply emit a pair of scalar add/sub.
9722 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9723 return SDValue();
9724
9725 // Convert this build_vector into two horizontal add/sub followed by
9726 // a concat vector.
9727 bool isUndefLO = NumUndefsLO == Half;
9728 bool isUndefHI = NumUndefsHI == Half;
9729 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9730 isUndefLO, isUndefHI);
9731 }
9732
9733 return SDValue();
9734}
9735
9736static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9737 SelectionDAG &DAG);
9738
9739/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9740/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9741/// just apply the bit to the vectors.
9742/// NOTE: It's not in our interest to start making a general-purpose vectorizer
9743/// from this, but enough scalar bit operations are created from the later
9744/// legalization + scalarization stages to need basic support.
9745static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9746 const X86Subtarget &Subtarget,
9747 SelectionDAG &DAG) {
9748 SDLoc DL(Op);
9749 MVT VT = Op->getSimpleValueType(0);
9750 unsigned NumElems = VT.getVectorNumElements();
9751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9752
9753 // Check that all elements have the same opcode.
9754 // TODO: Should we allow UNDEFS and if so how many?
9755 unsigned Opcode = Op->getOperand(0).getOpcode();
9756 for (unsigned i = 1; i < NumElems; ++i)
9757 if (Opcode != Op->getOperand(i).getOpcode())
9758 return SDValue();
9759
9760 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9761 bool IsShift = false;
9762 switch (Opcode) {
9763 default:
9764 return SDValue();
9765 case ISD::SHL:
9766 case ISD::SRL:
9767 case ISD::SRA:
9768 IsShift = true;
9769 break;
9770 case ISD::AND:
9771 case ISD::XOR:
9772 case ISD::OR:
9773 // Don't do this if the buildvector is a splat - we'd replace one
9774 // constant with an entire vector.
9775 if (Op->getSplatValue())
9776 return SDValue();
9777 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9778 return SDValue();
9779 break;
9780 }
9781
9782 SmallVector<SDValue, 4> LHSElts, RHSElts;
9783 for (SDValue Elt : Op->ops()) {
9784 SDValue LHS = Elt.getOperand(0);
9785 SDValue RHS = Elt.getOperand(1);
9786
9787 // We expect the canonicalized RHS operand to be the constant.
9788 if (!isa<ConstantSDNode>(RHS))
9789 return SDValue();
9790
9791 // Extend shift amounts.
9792 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9793 if (!IsShift)
9794 return SDValue();
9795 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9796 }
9797
9798 LHSElts.push_back(LHS);
9799 RHSElts.push_back(RHS);
9800 }
9801
9802 // Limit to shifts by uniform immediates.
9803 // TODO: Only accept vXi8/vXi64 special cases?
9804 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9805 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9806 return SDValue();
9807
9808 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9809 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9810 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9811
9812 if (!IsShift)
9813 return Res;
9814
9815 // Immediately lower the shift to ensure the constant build vector doesn't
9816 // get converted to a constant pool before the shift is lowered.
9817 return LowerShift(Res, Subtarget, DAG);
9818}
9819
9820/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9821/// functionality to do this, so it's all zeros, all ones, or some derivation
9822/// that is cheap to calculate.
9823static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9824 const X86Subtarget &Subtarget) {
9825 SDLoc DL(Op);
9826 MVT VT = Op.getSimpleValueType();
9827
9828 // Vectors containing all zeros can be matched by pxor and xorps.
9829 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9830 return Op;
9831
9832 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9833 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9834 // vpcmpeqd on 256-bit vectors.
9835 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9836 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9837 return Op;
9838
9839 return getOnesVector(VT, DAG, DL);
9840 }
9841
9842 return SDValue();
9843}
9844
9845/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9846/// from a vector of source values and a vector of extraction indices.
9847/// The vectors might be manipulated to match the type of the permute op.
9848static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9849 SDLoc &DL, SelectionDAG &DAG,
9850 const X86Subtarget &Subtarget) {
9851 MVT ShuffleVT = VT;
9852 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9853 unsigned NumElts = VT.getVectorNumElements();
9854 unsigned SizeInBits = VT.getSizeInBits();
9855
9856 // Adjust IndicesVec to match VT size.
9857   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9858          "Illegal variable permute mask size");
9859 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9860 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9861 NumElts * VT.getScalarSizeInBits());
9862 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9863
9864   // Handle a SrcVec whose size doesn't match VT.
9865 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9866 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9867 // Handle larger SrcVec by treating it as a larger permute.
9868 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9869 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9870 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9871 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9872 Subtarget, DAG, SDLoc(IndicesVec));
9873 SDValue NewSrcVec =
9874 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9875 if (NewSrcVec)
9876 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9877 return SDValue();
9878 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9879 // Widen smaller SrcVec to match VT.
9880 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9881 } else
9882 return SDValue();
9883 }
9884
9885 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9886     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9887 EVT SrcVT = Idx.getValueType();
9888 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9889 uint64_t IndexScale = 0;
9890 uint64_t IndexOffset = 0;
9891
9892 // If we're scaling a smaller permute op, then we need to repeat the
9893 // indices, scaling and offsetting them as well.
9894 // e.g. v4i32 -> v16i8 (Scale = 4)
9895 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9896 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9897 for (uint64_t i = 0; i != Scale; ++i) {
9898 IndexScale |= Scale << (i * NumDstBits);
9899 IndexOffset |= i << (i * NumDstBits);
9900 }
9901
9902 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9903 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9904 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9905 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9906 return Idx;
9907 };
9908
9909 unsigned Opcode = 0;
9910 switch (VT.SimpleTy) {
9911 default:
9912 break;
9913 case MVT::v16i8:
9914 if (Subtarget.hasSSSE3())
9915 Opcode = X86ISD::PSHUFB;
9916 break;
9917 case MVT::v8i16:
9918 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9919 Opcode = X86ISD::VPERMV;
9920 else if (Subtarget.hasSSSE3()) {
9921 Opcode = X86ISD::PSHUFB;
9922 ShuffleVT = MVT::v16i8;
9923 }
9924 break;
9925 case MVT::v4f32:
9926 case MVT::v4i32:
9927 if (Subtarget.hasAVX()) {
9928 Opcode = X86ISD::VPERMILPV;
9929 ShuffleVT = MVT::v4f32;
9930 } else if (Subtarget.hasSSSE3()) {
9931 Opcode = X86ISD::PSHUFB;
9932 ShuffleVT = MVT::v16i8;
9933 }
9934 break;
9935 case MVT::v2f64:
9936 case MVT::v2i64:
9937 if (Subtarget.hasAVX()) {
9938 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9939 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9940 Opcode = X86ISD::VPERMILPV;
9941 ShuffleVT = MVT::v2f64;
9942 } else if (Subtarget.hasSSE41()) {
9943 // SSE41 can compare v2i64 - select between indices 0 and 1.
9944 return DAG.getSelectCC(
9945 DL, IndicesVec,
9946 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9947 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9948 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9949 ISD::CondCode::SETEQ);
9950 }
9951 break;
9952 case MVT::v32i8:
9953 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9954 Opcode = X86ISD::VPERMV;
9955 else if (Subtarget.hasXOP()) {
9956 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9957 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9958 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9959 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9960 return DAG.getNode(
9961 ISD::CONCAT_VECTORS, DL, VT,
9962 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9963 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9964 } else if (Subtarget.hasAVX()) {
9965 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9966 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9967 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9968 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9969 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9970 ArrayRef<SDValue> Ops) {
9971 // Permute Lo and Hi and then select based on index range.
9972         // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9973         // care about bit[7] as it's just an index vector.
9974 SDValue Idx = Ops[2];
9975 EVT VT = Idx.getValueType();
9976 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9977 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9978 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9979 ISD::CondCode::SETGT);
9980 };
9981 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9982 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9983 PSHUFBBuilder);
9984 }
9985 break;
9986 case MVT::v16i16:
9987 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9988 Opcode = X86ISD::VPERMV;
9989 else if (Subtarget.hasAVX()) {
9990 // Scale to v32i8 and perform as v32i8.
9991 IndicesVec = ScaleIndices(IndicesVec, 2);
9992 return DAG.getBitcast(
9993 VT, createVariablePermute(
9994 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9995 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9996 }
9997 break;
9998 case MVT::v8f32:
9999 case MVT::v8i32:
10000 if (Subtarget.hasAVX2())
10001 Opcode = X86ISD::VPERMV;
10002 else if (Subtarget.hasAVX()) {
10003 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10004 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10005 {0, 1, 2, 3, 0, 1, 2, 3});
10006 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10007 {4, 5, 6, 7, 4, 5, 6, 7});
10008 if (Subtarget.hasXOP())
10009 return DAG.getBitcast(
10010 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10011 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10012 // Permute Lo and Hi and then select based on index range.
10013 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10014 SDValue Res = DAG.getSelectCC(
10015 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10016 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10017 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10018 ISD::CondCode::SETGT);
10019 return DAG.getBitcast(VT, Res);
10020 }
10021 break;
10022 case MVT::v4i64:
10023 case MVT::v4f64:
10024 if (Subtarget.hasAVX512()) {
10025 if (!Subtarget.hasVLX()) {
10026 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10027 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10028 SDLoc(SrcVec));
10029 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10030 DAG, SDLoc(IndicesVec));
10031 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10032 DAG, Subtarget);
10033 return extract256BitVector(Res, 0, DAG, DL);
10034 }
10035 Opcode = X86ISD::VPERMV;
10036 } else if (Subtarget.hasAVX()) {
10037 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10038 SDValue LoLo =
10039 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10040 SDValue HiHi =
10041 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10042 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10043 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10044 if (Subtarget.hasXOP())
10045 return DAG.getBitcast(
10046 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10047 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10048 // Permute Lo and Hi and then select based on index range.
10049 // This works as VPERMILPD only uses index bit[1] to permute elements.
10050 SDValue Res = DAG.getSelectCC(
10051 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10052 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10053 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10054 ISD::CondCode::SETGT);
10055 return DAG.getBitcast(VT, Res);
10056 }
10057 break;
10058 case MVT::v64i8:
10059 if (Subtarget.hasVBMI())
10060 Opcode = X86ISD::VPERMV;
10061 break;
10062 case MVT::v32i16:
10063 if (Subtarget.hasBWI())
10064 Opcode = X86ISD::VPERMV;
10065 break;
10066 case MVT::v16f32:
10067 case MVT::v16i32:
10068 case MVT::v8f64:
10069 case MVT::v8i64:
10070 if (Subtarget.hasAVX512())
10071 Opcode = X86ISD::VPERMV;
10072 break;
10073 }
10074 if (!Opcode)
10075 return SDValue();
10076
10077   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10078          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10079          "Illegal variable permute shuffle type");
10080
10081 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10082 if (Scale > 1)
10083 IndicesVec = ScaleIndices(IndicesVec, Scale);
10084
10085 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10086 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10087
10088 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10089 SDValue Res = Opcode == X86ISD::VPERMV
10090 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10091 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10092 return DAG.getBitcast(VT, Res);
10093}
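The IndexScale/IndexOffset arithmetic in the ScaleIndices lambda above is easiest to see with concrete numbers. The standalone sketch below reproduces the v4i32 -> v16i8 case (Scale = 4, NumDstBits = 8) on a single scalar index; the splat constants and the resulting byte indices are the ones the comment inside the lambda describes (illustration only, not part of the file):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Scale = 4, NumDstBits = 8;
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits); // ends up 0x04040404
    IndexOffset |= i << (i * NumDstBits);    // ends up 0x03020100
  }
  // One 32-bit shuffle index with value 2: after MUL + ADD its four bytes
  // become the byte indices 8, 9, 10, 11, i.e. 4*2 .. 4*2+3.
  uint32_t Index = 2;
  uint32_t Bytes = (uint32_t)(Index * IndexScale + IndexOffset);
  std::printf("scale=0x%llx offset=0x%llx bytes=0x%08x\n",
              (unsigned long long)IndexScale, (unsigned long long)IndexOffset,
              Bytes); // bytes = 0x0b0a0908
  return 0;
}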
10094
10095// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10096// reasoned to be a permutation of a vector by indices in a non-constant vector.
10097// (build_vector (extract_elt V, (extract_elt I, 0)),
10098// (extract_elt V, (extract_elt I, 1)),
10099// ...
10100// ->
10101// (vpermv I, V)
10102//
10103// TODO: Handle undefs
10104// TODO: Utilize pshufb and zero mask blending to support more efficient
10105// construction of vectors with constant-0 elements.
10106static SDValue
10107LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10108 const X86Subtarget &Subtarget) {
10109 SDValue SrcVec, IndicesVec;
10110 // Check for a match of the permute source vector and permute index elements.
10111 // This is done by checking that the i-th build_vector operand is of the form:
10112 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10113 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10114 SDValue Op = V.getOperand(Idx);
10115 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10116 return SDValue();
10117
10118 // If this is the first extract encountered in V, set the source vector,
10119 // otherwise verify the extract is from the previously defined source
10120 // vector.
10121 if (!SrcVec)
10122 SrcVec = Op.getOperand(0);
10123 else if (SrcVec != Op.getOperand(0))
10124 return SDValue();
10125 SDValue ExtractedIndex = Op->getOperand(1);
10126 // Peek through extends.
10127 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10128 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10129 ExtractedIndex = ExtractedIndex.getOperand(0);
10130 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10131 return SDValue();
10132
10133 // If this is the first extract from the index vector candidate, set the
10134 // indices vector, otherwise verify the extract is from the previously
10135 // defined indices vector.
10136 if (!IndicesVec)
10137 IndicesVec = ExtractedIndex.getOperand(0);
10138 else if (IndicesVec != ExtractedIndex.getOperand(0))
10139 return SDValue();
10140
10141 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10142 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10143 return SDValue();
10144 }
10145
10146 SDLoc DL(V);
10147 MVT VT = V.getSimpleValueType();
10148 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10149}
10150
10151SDValue
10152X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10153 SDLoc dl(Op);
10154
10155 MVT VT = Op.getSimpleValueType();
10156 MVT EltVT = VT.getVectorElementType();
10157 unsigned NumElems = Op.getNumOperands();
10158
10159 // Generate vectors for predicate vectors.
10160 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10161 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10162
10163 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10164 return VectorConstant;
10165
10166 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10167 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10168 return AddSub;
10169 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10170 return HorizontalOp;
10171 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10172 return Broadcast;
10173 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10174 return BitOp;
10175
10176 unsigned EVTBits = EltVT.getSizeInBits();
10177
10178 unsigned NumZero = 0;
10179 unsigned NumNonZero = 0;
10180 uint64_t NonZeros = 0;
10181 bool IsAllConstants = true;
10182 SmallSet<SDValue, 8> Values;
10183 unsigned NumConstants = NumElems;
10184 for (unsigned i = 0; i < NumElems; ++i) {
10185 SDValue Elt = Op.getOperand(i);
10186 if (Elt.isUndef())
10187 continue;
10188 Values.insert(Elt);
10189 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10190 IsAllConstants = false;
10191 NumConstants--;
10192 }
10193 if (X86::isZeroNode(Elt))
10194 NumZero++;
10195 else {
10196       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
10197 NonZeros |= ((uint64_t)1 << i);
10198 NumNonZero++;
10199 }
10200 }
10201
10202 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10203 if (NumNonZero == 0)
10204 return DAG.getUNDEF(VT);
10205
10206 // If we are inserting one variable into a vector of non-zero constants, try
10207 // to avoid loading each constant element as a scalar. Load the constants as a
10208 // vector and then insert the variable scalar element. If insertion is not
10209 // supported, fall back to a shuffle to get the scalar blended with the
10210 // constants. Insertion into a zero vector is handled as a special-case
10211 // somewhere below here.
10212 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10213 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10214 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10215 // Create an all-constant vector. The variable element in the old
10216 // build vector is replaced by undef in the constant vector. Save the
10217 // variable scalar element and its index for use in the insertelement.
10218 LLVMContext &Context = *DAG.getContext();
10219 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10220 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10221 SDValue VarElt;
10222 SDValue InsIndex;
10223 for (unsigned i = 0; i != NumElems; ++i) {
10224 SDValue Elt = Op.getOperand(i);
10225 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10226 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10227 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10228 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10229 else if (!Elt.isUndef()) {
10230         assert(!VarElt.getNode() && !InsIndex.getNode() &&
10231                "Expected one variable element in this vector");
10232 VarElt = Elt;
10233 InsIndex = DAG.getVectorIdxConstant(i, dl);
10234 }
10235 }
10236 Constant *CV = ConstantVector::get(ConstVecOps);
10237 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10238
10239 // The constants we just created may not be legal (eg, floating point). We
10240 // must lower the vector right here because we can not guarantee that we'll
10241 // legalize it before loading it. This is also why we could not just create
10242 // a new build vector here. If the build vector contains illegal constants,
10243 // it could get split back up into a series of insert elements.
10244 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10245 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10246 MachineFunction &MF = DAG.getMachineFunction();
10247 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10248 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10249 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10250 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10251 if (InsertC < NumEltsInLow128Bits)
10252 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10253
10254 // There's no good way to insert into the high elements of a >128-bit
10255 // vector, so use shuffles to avoid an extract/insert sequence.
10256     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10257     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10258 SmallVector<int, 8> ShuffleMask;
10259 unsigned NumElts = VT.getVectorNumElements();
10260 for (unsigned i = 0; i != NumElts; ++i)
10261 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10262 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10263 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10264 }
10265
10266 // Special case for single non-zero, non-undef, element.
10267 if (NumNonZero == 1) {
10268 unsigned Idx = countTrailingZeros(NonZeros);
10269 SDValue Item = Op.getOperand(Idx);
10270
10271 // If we have a constant or non-constant insertion into the low element of
10272 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10273 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10274 // depending on what the source datatype is.
10275 if (Idx == 0) {
10276 if (NumZero == 0)
10277 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10278
10279 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10280 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10281         assert((VT.is128BitVector() || VT.is256BitVector() ||
10282                 VT.is512BitVector()) &&
10283                "Expected an SSE value type!");
10284 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10285 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10286 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10287 }
10288
10289 // We can't directly insert an i8 or i16 into a vector, so zero extend
10290 // it to i32 first.
10291 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10292 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10293 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
10294 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10295 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10296 return DAG.getBitcast(VT, Item);
10297 }
10298 }
10299
10300 // Is it a vector logical left shift?
10301 if (NumElems == 2 && Idx == 1 &&
10302 X86::isZeroNode(Op.getOperand(0)) &&
10303 !X86::isZeroNode(Op.getOperand(1))) {
10304 unsigned NumBits = VT.getSizeInBits();
10305 return getVShift(true, VT,
10306 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10307 VT, Op.getOperand(1)),
10308 NumBits/2, DAG, *this, dl);
10309 }
10310
10311 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10312 return SDValue();
10313
10314 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10315 // is a non-constant being inserted into an element other than the low one,
10316 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10317 // movd/movss) to move this into the low element, then shuffle it into
10318 // place.
10319 if (EVTBits == 32) {
10320 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10321 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10322 }
10323 }
10324
10325 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10326 if (Values.size() == 1) {
10327 if (EVTBits == 32) {
10328 // Instead of a shuffle like this:
10329 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10330 // Check if it's possible to issue this instead.
10331       // shuffle (vload ptr), undef, <1, 1, 1, 1>
10332 unsigned Idx = countTrailingZeros(NonZeros);
10333 SDValue Item = Op.getOperand(Idx);
10334 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10335 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10336 }
10337 return SDValue();
10338 }
10339
10340 // A vector full of immediates; various special cases are already
10341 // handled, so this is best done with a single constant-pool load.
10342 if (IsAllConstants)
10343 return SDValue();
10344
10345 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10346 return V;
10347
10348 // See if we can use a vector load to get all of the elements.
10349 {
10350 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10351 if (SDValue LD =
10352 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10353 return LD;
10354 }
10355
10356 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10357 // build_vector and broadcast it.
10358 // TODO: We could probably generalize this more.
10359 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10360 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10361 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10362 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10363 // Make sure all the even/odd operands match.
10364 for (unsigned i = 2; i != NumElems; ++i)
10365 if (Ops[i % 2] != Op.getOperand(i))
10366 return false;
10367 return true;
10368 };
10369 if (CanSplat(Op, NumElems, Ops)) {
10370 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10371 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10372 // Create a new build vector and cast to v2i64/v2f64.
10373 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10374 DAG.getBuildVector(NarrowVT, dl, Ops));
10375 // Broadcast from v2i64/v2f64 and cast to final VT.
10376 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
10377 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10378 NewBV));
10379 }
10380 }
10381
10382 // For AVX-length vectors, build the individual 128-bit pieces and use
10383 // shuffles to put them in place.
10384 if (VT.getSizeInBits() > 128) {
10385 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
10386
10387 // Build both the lower and upper subvector.
10388 SDValue Lower =
10389 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10390 SDValue Upper = DAG.getBuildVector(
10391 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10392
10393 // Recreate the wider vector with the lower and upper part.
10394 return concatSubVectors(Lower, Upper, DAG, dl);
10395 }
10396
10397 // Let legalizer expand 2-wide build_vectors.
10398 if (EVTBits == 64) {
10399 if (NumNonZero == 1) {
10400 // One half is zero or undef.
10401 unsigned Idx = countTrailingZeros(NonZeros);
10402 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10403 Op.getOperand(Idx));
10404 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10405 }
10406 return SDValue();
10407 }
10408
10409 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10410 if (EVTBits == 8 && NumElems == 16)
10411 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
10412 DAG, Subtarget))
10413 return V;
10414
10415 if (EVTBits == 16 && NumElems == 8)
10416 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
10417 DAG, Subtarget))
10418 return V;
10419
10420 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10421 if (EVTBits == 32 && NumElems == 4)
10422 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10423 return V;
10424
10425 // If element VT is == 32 bits, turn it into a number of shuffles.
10426 if (NumElems == 4 && NumZero > 0) {
10427 SmallVector<SDValue, 8> Ops(NumElems);
10428 for (unsigned i = 0; i < 4; ++i) {
10429 bool isZero = !(NonZeros & (1ULL << i));
10430 if (isZero)
10431 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10432 else
10433 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10434 }
10435
10436 for (unsigned i = 0; i < 2; ++i) {
10437 switch ((NonZeros >> (i*2)) & 0x3) {
10438       default: llvm_unreachable("Unexpected NonZero count");
10439 case 0:
10440 Ops[i] = Ops[i*2]; // Must be a zero vector.
10441 break;
10442 case 1:
10443 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10444 break;
10445 case 2:
10446 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10447 break;
10448 case 3:
10449 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10450 break;
10451 }
10452 }
10453
10454 bool Reverse1 = (NonZeros & 0x3) == 2;
10455 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
10456 int MaskVec[] = {
10457 Reverse1 ? 1 : 0,
10458 Reverse1 ? 0 : 1,
10459 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10460 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10461 };
10462 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10463 }
10464
10465   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10466
10467 // Check for a build vector from mostly shuffle plus few inserting.
10468 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10469 return Sh;
10470
10471 // For SSE 4.1, use insertps to put the high elements into the low element.
10472 if (Subtarget.hasSSE41()) {
10473 SDValue Result;
10474 if (!Op.getOperand(0).isUndef())
10475 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10476 else
10477 Result = DAG.getUNDEF(VT);
10478
10479 for (unsigned i = 1; i < NumElems; ++i) {
10480 if (Op.getOperand(i).isUndef()) continue;
10481 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10482 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10483 }
10484 return Result;
10485 }
10486
10487 // Otherwise, expand into a number of unpckl*, start by extending each of
10488 // our (non-undef) elements to the full vector width with the element in the
10489 // bottom slot of the vector (which generates no code for SSE).
10490 SmallVector<SDValue, 8> Ops(NumElems);
10491 for (unsigned i = 0; i < NumElems; ++i) {
10492 if (!Op.getOperand(i).isUndef())
10493 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10494 else
10495 Ops[i] = DAG.getUNDEF(VT);
10496 }
10497
10498 // Next, we iteratively mix elements, e.g. for v4f32:
10499 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10500 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10501 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10502 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10503 // Generate scaled UNPCKL shuffle mask.
10504 SmallVector<int, 16> Mask;
10505 for(unsigned i = 0; i != Scale; ++i)
10506 Mask.push_back(i);
10507 for (unsigned i = 0; i != Scale; ++i)
10508 Mask.push_back(NumElems+i);
10509 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10510
10511 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10512 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10513 }
10514 return Ops[0];
10515}
10516
10517// 256-bit AVX can use the vinsertf128 instruction
10518// to create 256-bit vectors from two other 128-bit ones.
10519// TODO: Detect subvector broadcast here instead of DAG combine?
10520static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10521 const X86Subtarget &Subtarget) {
10522 SDLoc dl(Op);
10523 MVT ResVT = Op.getSimpleValueType();
10524
10525   assert((ResVT.is256BitVector() ||
10526           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10527
10528 unsigned NumOperands = Op.getNumOperands();
10529 unsigned NumZero = 0;
10530 unsigned NumNonZero = 0;
10531 unsigned NonZeros = 0;
10532 for (unsigned i = 0; i != NumOperands; ++i) {
10533 SDValue SubVec = Op.getOperand(i);
10534 if (SubVec.isUndef())
10535 continue;
10536 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10537 ++NumZero;
10538 else {
10539       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10540 NonZeros |= 1 << i;
10541 ++NumNonZero;
10542 }
10543 }
10544
10545 // If we have more than 2 non-zeros, build each half separately.
10546 if (NumNonZero > 2) {
10547 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10548 ArrayRef<SDUse> Ops = Op->ops();
10549 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10550 Ops.slice(0, NumOperands/2));
10551 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10552 Ops.slice(NumOperands/2));
10553 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10554 }
10555
10556 // Otherwise, build it up through insert_subvectors.
10557 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10558 : DAG.getUNDEF(ResVT);
10559
10560 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10561 unsigned NumSubElems = SubVT.getVectorNumElements();
10562 for (unsigned i = 0; i != NumOperands; ++i) {
10563 if ((NonZeros & (1 << i)) == 0)
10564 continue;
10565
10566 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10567 Op.getOperand(i),
10568 DAG.getIntPtrConstant(i * NumSubElems, dl));
10569 }
10570
10571 return Vec;
10572}
10573
10574// Returns true if the given node is a type promotion (by concatenating i1
10575// zeros) of the result of a node that already zeros all upper bits of
10576// k-register.
10577// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10578static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10579 const X86Subtarget &Subtarget,
10580 SelectionDAG & DAG) {
10581 SDLoc dl(Op);
10582 MVT ResVT = Op.getSimpleValueType();
10583 unsigned NumOperands = Op.getNumOperands();
10584
10585   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10586          "Unexpected number of operands in CONCAT_VECTORS");
10587
10588 uint64_t Zeros = 0;
10589 uint64_t NonZeros = 0;
10590 for (unsigned i = 0; i != NumOperands; ++i) {
10591 SDValue SubVec = Op.getOperand(i);
10592 if (SubVec.isUndef())
10593 continue;
10594     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10595 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10596 Zeros |= (uint64_t)1 << i;
10597 else
10598 NonZeros |= (uint64_t)1 << i;
10599 }
10600
10601 unsigned NumElems = ResVT.getVectorNumElements();
10602
10603  // If we are inserting a non-zero vector and there are zeros in the LSBs and
10604  // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10605  // insert_subvector will give us two kshifts.
10606 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10607 Log2_64(NonZeros) != NumOperands - 1) {
10608 MVT ShiftVT = ResVT;
10609 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10610 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10611 unsigned Idx = Log2_64(NonZeros);
10612 SDValue SubVec = Op.getOperand(Idx);
10613 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10614 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10615 DAG.getUNDEF(ShiftVT), SubVec,
10616 DAG.getIntPtrConstant(0, dl));
10617 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10618 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10619 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10620 DAG.getIntPtrConstant(0, dl));
10621 }
10622
10623 // If there are zero or one non-zeros we can handle this very simply.
10624 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10625 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10626 if (!NonZeros)
10627 return Vec;
10628 unsigned Idx = Log2_64(NonZeros);
10629 SDValue SubVec = Op.getOperand(Idx);
10630 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10631 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10632 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10633 }
10634
10635 if (NumOperands > 2) {
10636 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10637 ArrayRef<SDUse> Ops = Op->ops();
10638 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10639 Ops.slice(0, NumOperands/2));
10640 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10641 Ops.slice(NumOperands/2));
10642 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10643 }
10644
10645   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10646
10647 if (ResVT.getVectorNumElements() >= 16)
10648 return Op; // The operation is legal with KUNPCK
10649
10650 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10651 DAG.getUNDEF(ResVT), Op.getOperand(0),
10652 DAG.getIntPtrConstant(0, dl));
10653 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10654 DAG.getIntPtrConstant(NumElems/2, dl));
10655}
10656
10657static SDValue LowerCONCAT_VECTORS(SDValue Op,
10658 const X86Subtarget &Subtarget,
10659 SelectionDAG &DAG) {
10660 MVT VT = Op.getSimpleValueType();
10661 if (VT.getVectorElementType() == MVT::i1)
10662 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10663
10664   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10665          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10666                                   Op.getNumOperands() == 4)));
10667
10668 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10669 // from two other 128-bit ones.
10670
10671 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10672 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10673}
10674
10675//===----------------------------------------------------------------------===//
10676// Vector shuffle lowering
10677//
10678// This is an experimental code path for lowering vector shuffles on x86. It is
10679// designed to handle arbitrary vector shuffles and blends, gracefully
10680// degrading performance as necessary. It works hard to recognize idiomatic
10681// shuffles and lower them to optimal instruction patterns without leaving
10682// a framework that allows reasonably efficient handling of all vector shuffle
10683// patterns.
10684//===----------------------------------------------------------------------===//
10685
10686/// Tiny helper function to identify a no-op mask.
10687///
10688/// This is a somewhat boring predicate function. It checks whether the mask
10689/// array input, which is assumed to be a single-input shuffle mask of the kind
10690/// used by the X86 shuffle instructions (not a fully general
10691/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10692/// in-place shuffle are 'no-op's.
10693static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10694 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10695     assert(Mask[i] >= -1 && "Out of bound mask element!");
10696 if (Mask[i] >= 0 && Mask[i] != i)
10697 return false;
10698 }
10699 return true;
10700}
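// Illustrative example (derived from the check above): for a 4-element
// single-input mask, <0, -1, 2, 3> is a no-op (only undef and in-place
// entries), while <1, 0, 2, 3> is not, because element 0 pulls from index 1.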
10701
10702/// Test whether there are elements crossing LaneSizeInBits lanes in this
10703/// shuffle mask.
10704///
10705/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10706/// and we routinely test for these.
10707static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10708 unsigned ScalarSizeInBits,
10709 ArrayRef<int> Mask) {
10710   assert(LaneSizeInBits && ScalarSizeInBits &&
10711          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10712          "Illegal shuffle lane size");
10713 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10714 int Size = Mask.size();
10715 for (int i = 0; i < Size; ++i)
10716 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10717 return true;
10718 return false;
10719}
10720
10721/// Test whether there are elements crossing 128-bit lanes in this
10722/// shuffle mask.
10723static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10724 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10725}
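// Illustrative example (derived from the check above): for v8f32 each 128-bit
// lane holds 4 elements, so the mask <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes
// (every element comes from the other lane), while <1, 0, 3, 2, 5, 4, 7, 6>
// stays within its lane.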
10726
10727/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10728/// from multiple lanes - this differs from isLaneCrossingShuffleMask in order
10729/// to better support 'repeated mask + lane permute' style shuffles.
10730static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10731 unsigned ScalarSizeInBits,
10732 ArrayRef<int> Mask) {
10733   assert(LaneSizeInBits && ScalarSizeInBits &&
10734          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10735          "Illegal shuffle lane size");
10736 int NumElts = Mask.size();
10737 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10738 int NumLanes = NumElts / NumEltsPerLane;
10739 if (NumLanes > 1) {
10740 for (int i = 0; i != NumLanes; ++i) {
10741 int SrcLane = -1;
10742 for (int j = 0; j != NumEltsPerLane; ++j) {
10743 int M = Mask[(i * NumEltsPerLane) + j];
10744 if (M < 0)
10745 continue;
10746 int Lane = (M % NumElts) / NumEltsPerLane;
10747 if (SrcLane >= 0 && SrcLane != Lane)
10748 return true;
10749 SrcLane = Lane;
10750 }
10751 }
10752 }
10753 return false;
10754}
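// Illustrative example (derived from the check above): for v8f32 with 128-bit
// lanes, <0, 5, -1, -1, 4, 4, 4, 4> is multi-lane because its first lane mixes
// sources from lane 0 (element 0) and lane 1 (element 5), whereas
// <4, 5, 6, 7, 0, 1, 2, 3> is not multi-lane even though it crosses lanes,
// since each destination lane reads from a single source lane.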
10755
10756/// Test whether a shuffle mask is equivalent within each sub-lane.
10757///
10758/// This checks a shuffle mask to see if it is performing the same
10759/// lane-relative shuffle in each sub-lane. This trivially implies
10760/// that it is also not lane-crossing. It may however involve a blend from the
10761/// same lane of a second vector.
10762///
10763/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10764/// non-trivial to compute in the face of undef lanes. The representation is
10765/// suitable for use with existing 128-bit shuffles as entries from the second
10766/// vector have been remapped to [LaneSize, 2*LaneSize).
10767static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10768 ArrayRef<int> Mask,
10769 SmallVectorImpl<int> &RepeatedMask) {
10770 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10771 RepeatedMask.assign(LaneSize, -1);
10772 int Size = Mask.size();
10773 for (int i = 0; i < Size; ++i) {
10774     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10775 if (Mask[i] < 0)
10776 continue;
10777 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10778 // This entry crosses lanes, so there is no way to model this shuffle.
10779 return false;
10780
10781 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10782 // Adjust second vector indices to start at LaneSize instead of Size.
10783 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10784 : Mask[i] % LaneSize + LaneSize;
10785 if (RepeatedMask[i % LaneSize] < 0)
10786 // This is the first non-undef entry in this slot of a 128-bit lane.
10787 RepeatedMask[i % LaneSize] = LocalM;
10788 else if (RepeatedMask[i % LaneSize] != LocalM)
10789 // Found a mismatch with the repeated mask.
10790 return false;
10791 }
10792 return true;
10793}
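// Illustrative example (derived from the logic above): for v8i32 with 128-bit
// lanes, the mask <0, 3, 8, 11, 4, 7, 12, 15> applies the same lane-relative
// pattern in both lanes, producing RepeatedMask = <0, 3, 4, 7> (entries of 4
// or more refer to the matching lane of the second vector).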
10794
10795/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10796static bool
10797is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10798 SmallVectorImpl<int> &RepeatedMask) {
10799 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10800}
10801
10802static bool
10803is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10804 SmallVector<int, 32> RepeatedMask;
10805 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10806}
10807
10808/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10809static bool
10810is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10811 SmallVectorImpl<int> &RepeatedMask) {
10812 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10813}
10814
10815/// Test whether a target shuffle mask is equivalent within each sub-lane.
10816/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10817static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10818 unsigned EltSizeInBits,
10819 ArrayRef<int> Mask,
10820 SmallVectorImpl<int> &RepeatedMask) {
10821 int LaneSize = LaneSizeInBits / EltSizeInBits;
10822 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10823 int Size = Mask.size();
10824 for (int i = 0; i < Size; ++i) {
10825     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10826 if (Mask[i] == SM_SentinelUndef)
10827 continue;
10828 if (Mask[i] == SM_SentinelZero) {
10829 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10830 return false;
10831 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10832 continue;
10833 }
10834 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10835 // This entry crosses lanes, so there is no way to model this shuffle.
10836 return false;
10837
10838 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10839 // Adjust second vector indices to start at LaneSize instead of Size.
10840 int LocalM =
10841 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
10842 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10843 // This is the first non-undef entry in this slot of a 128-bit lane.
10844 RepeatedMask[i % LaneSize] = LocalM;
10845 else if (RepeatedMask[i % LaneSize] != LocalM)
10846 // Found a mismatch with the repeated mask.
10847 return false;
10848 }
10849 return true;
10850}
10851
10852/// Test whether a target shuffle mask is equivalent within each sub-lane.
10853/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10854static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10855 ArrayRef<int> Mask,
10856 SmallVectorImpl<int> &RepeatedMask) {
10857 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10858 Mask, RepeatedMask);
10859}
10860
10861/// Checks whether the vector elements referenced by two shuffle masks are
10862/// equivalent.
10863static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10864 int Idx, int ExpectedIdx) {
10865   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10866          ExpectedIdx < MaskSize && "Out of range element index");
10867 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10868 return false;
10869
10870 switch (Op.getOpcode()) {
10871 case ISD::BUILD_VECTOR:
10872 // If the values are build vectors, we can look through them to find
10873 // equivalent inputs that make the shuffles equivalent.
10874 // TODO: Handle MaskSize != Op.getNumOperands()?
10875 if (MaskSize == (int)Op.getNumOperands() &&
10876 MaskSize == (int)ExpectedOp.getNumOperands())
10877 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10878 break;
10879 case X86ISD::HADD:
10880 case X86ISD::HSUB:
10881 case X86ISD::FHADD:
10882 case X86ISD::FHSUB:
10883 case X86ISD::PACKSS:
10884 case X86ISD::PACKUS:
10885 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10886 // TODO: Handle MaskSize != NumElts?
10887 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10888 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10889 MVT VT = Op.getSimpleValueType();
10890 int NumElts = VT.getVectorNumElements();
10891 if (MaskSize == NumElts) {
10892 int NumLanes = VT.getSizeInBits() / 128;
10893 int NumEltsPerLane = NumElts / NumLanes;
10894 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10895 bool SameLane =
10896 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10897 bool SameElt =
10898 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10899 return SameLane && SameElt;
10900 }
10901 }
10902 break;
10903 }
10904
10905 return false;
10906}
10907
10908/// Checks whether a shuffle mask is equivalent to an explicit list of
10909/// arguments.
10910///
10911/// This is a fast way to test a shuffle mask against a fixed pattern:
10912///
10913/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10914///
10915/// It returns true if the mask is exactly as wide as the argument list, and
10916/// each element of the mask is either -1 (signifying undef) or the value given
10917/// in the argument.
10918static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10919 SDValue V1 = SDValue(),
10920 SDValue V2 = SDValue()) {
10921 int Size = Mask.size();
10922 if (Size != (int)ExpectedMask.size())
10923 return false;
10924
10925 for (int i = 0; i < Size; ++i) {
10926     assert(Mask[i] >= -1 && "Out of bound mask element!");
10927 int MaskIdx = Mask[i];
10928 int ExpectedIdx = ExpectedMask[i];
10929 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10930 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10931 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10932 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10933 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10934 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10935 return false;
10936 }
10937 }
10938 return true;
10939}
10940
10941/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10942///
10943/// The masks must be exactly the same width.
10944///
10945/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10946/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10947///
10948/// SM_SentinelZero is accepted as a valid negative index but must match in
10949/// both.
10950static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10951 ArrayRef<int> ExpectedMask,
10952 SDValue V1 = SDValue(),
10953 SDValue V2 = SDValue()) {
10954 int Size = Mask.size();
10955 if (Size != (int)ExpectedMask.size())
10956 return false;
10957   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
10958          "Illegal target shuffle mask");
10959
10960 // Check for out-of-range target shuffle mask indices.
10961 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10962 return false;
10963
10964 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10965 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
10966 V1 = SDValue();
10967 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
10968 V2 = SDValue();
10969
10970 for (int i = 0; i < Size; ++i) {
10971 int MaskIdx = Mask[i];
10972 int ExpectedIdx = ExpectedMask[i];
10973 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10974 continue;
10975 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
10976 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10977 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10978 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10979 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10980 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10981 continue;
10982 }
10983 // TODO - handle SM_Sentinel equivalences.
10984 return false;
10985 }
10986 return true;
10987}
10988
10989// Attempt to create a shuffle mask from a VSELECT condition mask.
10990static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
10991 SDValue Cond) {
10992 EVT CondVT = Cond.getValueType();
10993 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
10994 unsigned NumElts = CondVT.getVectorNumElements();
10995
10996 APInt UndefElts;
10997 SmallVector<APInt, 32> EltBits;
10998 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
10999 true, false))
11000 return false;
11001
11002 Mask.resize(NumElts, SM_SentinelUndef);
11003
11004 for (int i = 0; i != (int)NumElts; ++i) {
11005 Mask[i] = i;
11006 // Arbitrarily choose from the 2nd operand if the select condition element
11007 // is undef.
11008 // TODO: Can we do better by matching patterns such as even/odd?
11009 if (UndefElts[i] || EltBits[i].isNullValue())
11010 Mask[i] += NumElts;
11011 }
11012
11013 return true;
11014}
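// Illustrative example (derived from the logic above): for a v4i32 VSELECT
// whose condition constant is <-1, 0, -1, 0> (all-ones, zero, all-ones, zero),
// the resulting shuffle mask is <0, 5, 2, 7>: true lanes pick from the first
// operand and false (or undef) lanes pick from the second.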
11015
11016// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11017// instructions.
11018static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11019 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11020 return false;
11021
11022 SmallVector<int, 8> Unpcklwd;
11023 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11024 /* Unary = */ false);
11025 SmallVector<int, 8> Unpckhwd;
11026 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11027 /* Unary = */ false);
11028 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11029 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11030 return IsUnpackwdMask;
11031}
11032
11033static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11034 // Create 128-bit vector type based on mask size.
11035 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11036 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11037
11038 // We can't assume a canonical shuffle mask, so try the commuted version too.
11039 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11040 ShuffleVectorSDNode::commuteMask(CommutedMask);
11041
11042 // Match any of unary/binary or low/high.
11043 for (unsigned i = 0; i != 4; ++i) {
11044 SmallVector<int, 16> UnpackMask;
11045 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11046 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11047 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11048 return true;
11049 }
11050 return false;
11051}
11052
11053/// Return true if a shuffle mask chooses elements identically in its top and
11054/// bottom halves. For example, any splat mask has the same top and bottom
11055/// halves. If an element is undefined in only one half of the mask, the halves
11056/// are not considered identical.
11057static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11058   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11059 unsigned HalfSize = Mask.size() / 2;
11060 for (unsigned i = 0; i != HalfSize; ++i) {
11061 if (Mask[i] != Mask[i + HalfSize])
11062 return false;
11063 }
11064 return true;
11065}
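// Illustrative example (derived from the check above): <0, 1, 0, 1> has
// identical halves, but <0, 1, 0, -1> does not, because the undef appears in
// only one half.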
11066
11067/// Get a 4-lane 8-bit shuffle immediate for a mask.
11068///
11069/// This helper function produces an 8-bit shuffle immediate corresponding to
11070/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11071/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11072/// example.
11073///
11074/// NB: We rely heavily on "undef" masks preserving the input lane.
11075static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11076   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11077   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11078   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11079   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11080   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11081
11082 // If the mask only uses one non-undef element, then fully 'splat' it to
11083 // improve later broadcast matching.
11084 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11085   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11086
11087 int FirstElt = Mask[FirstIndex];
11088 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11089 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11090
11091 unsigned Imm = 0;
11092 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11093 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11094 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11095 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11096 return Imm;
11097}
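// Illustrative example (derived from the encoding above): the identity mask
// <0, 1, 2, 3> encodes as 0xE4 (0b11'10'01'00), while a mask with a single
// defined element such as <2, -1, -1, -1> is widened to <2, 2, 2, 2> and
// encodes as 0xAA to improve later broadcast matching.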
11098
11099static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11100 SelectionDAG &DAG) {
11101 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11102}
11103
11104// The shuffle result is as follows:
11105// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending
11106// order. Each element of Zeroable corresponds to a particular element of Mask,
11107// as described in the computeZeroableShuffleElements function.
11108//
11109// The function looks for a sub-mask whose nonzero elements are in increasing
11110// order. If such a sub-mask exists, the function returns true.
11111static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11112 ArrayRef<int> Mask, const EVT &VectorType,
11113 bool &IsZeroSideLeft) {
11114 int NextElement = -1;
11115 // Check if the Mask's nonzero elements are in increasing order.
11116 for (int i = 0, e = Mask.size(); i < e; i++) {
11117 // Checks if the mask's zeros elements are built from only zeros.
11118     assert(Mask[i] >= -1 && "Out of bound mask element!");
11119 if (Mask[i] < 0)
11120 return false;
11121 if (Zeroable[i])
11122 continue;
11123 // Find the lowest non zero element
11124 if (NextElement < 0) {
11125 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11126 IsZeroSideLeft = NextElement != 0;
11127 }
11128 // Exit if the mask's non zero elements are not in increasing order.
11129 if (NextElement != Mask[i])
11130 return false;
11131 NextElement++;
11132 }
11133 return true;
11134}
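// Illustrative example (derived from the logic above): with Zeroable =
// {1, 1, 0, 0} and Mask = <0, 0, 4, 5> for a 4-element vector, the nonzero
// elements 4, 5 appear in increasing order starting at the element count, so
// the function returns true with IsZeroSideLeft == true.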
11135
11136/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11137static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11138 ArrayRef<int> Mask, SDValue V1,
11139 SDValue V2, const APInt &Zeroable,
11140 const X86Subtarget &Subtarget,
11141 SelectionDAG &DAG) {
11142 int Size = Mask.size();
11143 int LaneSize = 128 / VT.getScalarSizeInBits();
11144 const int NumBytes = VT.getSizeInBits() / 8;
11145 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11146
11147   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11148          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11149          (Subtarget.hasBWI() && VT.is512BitVector()));
11150
11151 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11152 // Sign bit set in i8 mask means zero element.
11153 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11154
11155 SDValue V;
11156 for (int i = 0; i < NumBytes; ++i) {
11157 int M = Mask[i / NumEltBytes];
11158 if (M < 0) {
11159 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11160 continue;
11161 }
11162 if (Zeroable[i / NumEltBytes]) {
11163 PSHUFBMask[i] = ZeroMask;
11164 continue;
11165 }
11166
11167 // We can only use a single input of V1 or V2.
11168 SDValue SrcV = (M >= Size ? V2 : V1);
11169 if (V && V != SrcV)
11170 return SDValue();
11171 V = SrcV;
11172 M %= Size;
11173
11174 // PSHUFB can't cross lanes, ensure this doesn't happen.
11175 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11176 return SDValue();
11177
11178 M = M % LaneSize;
11179 M = M * NumEltBytes + (i % NumEltBytes);
11180 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11181 }
11182   assert(V && "Failed to find a source input");
11183
11184 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11185 return DAG.getBitcast(
11186 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11187 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11188}
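// Illustrative example (derived from the byte-mask construction above): a
// single-input v4i32 shuffle with Mask = <1, 1, 3, 3> becomes a PSHUFB with
// the byte mask <4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15>; a zeroable
// element would instead contribute 0x80 bytes to zero those lanes.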
11189
11190static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11191 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11192 const SDLoc &dl);
11193
11194// X86 has a dedicated shuffle that can be lowered to VEXPAND.
11195static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11196 const APInt &Zeroable,
11197 ArrayRef<int> Mask, SDValue &V1,
11198 SDValue &V2, SelectionDAG &DAG,
11199 const X86Subtarget &Subtarget) {
11200 bool IsLeftZeroSide = true;
11201 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11202 IsLeftZeroSide))
11203 return SDValue();
11204 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11205 MVT IntegerType =
11206 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11207 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11208 unsigned NumElts = VT.getVectorNumElements();
11209   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11210          "Unexpected number of vector elements");
11211 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11212 Subtarget, DAG, DL);
11213 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11214 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11215 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11216}
11217
11218static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11219 unsigned &UnpackOpcode, bool IsUnary,
11220 ArrayRef<int> TargetMask, const SDLoc &DL,
11221 SelectionDAG &DAG,
11222 const X86Subtarget &Subtarget) {
11223 int NumElts = VT.getVectorNumElements();
11224
11225 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11226 for (int i = 0; i != NumElts; i += 2) {
11227 int M1 = TargetMask[i + 0];
11228 int M2 = TargetMask[i + 1];
11229 Undef1 &= (SM_SentinelUndef == M1);
11230 Undef2 &= (SM_SentinelUndef == M2);
11231 Zero1 &= isUndefOrZero(M1);
11232 Zero2 &= isUndefOrZero(M2);
11233 }
11234   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11235          "Zeroable shuffle detected");
11236
11237 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11238 SmallVector<int, 64> Unpckl, Unpckh;
11239 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11240 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11241 UnpackOpcode = X86ISD::UNPCKL;
11242 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11243 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11244 return true;
11245 }
11246
11247 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11248 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11249 UnpackOpcode = X86ISD::UNPCKH;
11250 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11251 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11252 return true;
11253 }
11254
11255 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
11256 if (IsUnary && (Zero1 || Zero2)) {
11257 // Don't bother if we can blend instead.
11258 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11259 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11260 return false;
11261
11262 bool MatchLo = true, MatchHi = true;
11263 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11264 int M = TargetMask[i];
11265
11266 // Ignore if the input is known to be zero or the index is undef.
11267 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11268 (M == SM_SentinelUndef))
11269 continue;
11270
11271 MatchLo &= (M == Unpckl[i]);
11272 MatchHi &= (M == Unpckh[i]);
11273 }
11274
11275 if (MatchLo || MatchHi) {
11276 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11277 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11278 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11279 return true;
11280 }
11281 }
11282
11283 // If a binary shuffle, commute and try again.
11284 if (!IsUnary) {
11285 ShuffleVectorSDNode::commuteMask(Unpckl);
11286 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11287 UnpackOpcode = X86ISD::UNPCKL;
11288 std::swap(V1, V2);
11289 return true;
11290 }
11291
11292 ShuffleVectorSDNode::commuteMask(Unpckh);
11293 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11294 UnpackOpcode = X86ISD::UNPCKH;
11295 std::swap(V1, V2);
11296 return true;
11297 }
11298 }
11299
11300 return false;
11301}
11302
11303// X86 has dedicated unpack instructions that can handle specific blend
11304// operations: UNPCKH and UNPCKL.
11305static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11306 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11307 SelectionDAG &DAG) {
11308 SmallVector<int, 8> Unpckl;
11309 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11310 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11311 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11312
11313 SmallVector<int, 8> Unpckh;
11314 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11315 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11316 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11317
11318 // Commute and try again.
11319 ShuffleVectorSDNode::commuteMask(Unpckl);
11320 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11321 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11322
11323 ShuffleVectorSDNode::commuteMask(Unpckh);
11324 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11325 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11326
11327 return SDValue();
11328}
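
// A minimal standalone sketch of the UNPCKL/UNPCKH mask patterns that
// matchShuffleWithUNPCK / lowerShuffleWithUNPCK compare against, shown for an
// 8-element (v8i16-like) binary shuffle; indices >= NumElts refer to the
// second operand. Plain C++ only; not LLVM's createUnpackShuffleMask.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 8;
  std::array<int, NumElts> Unpckl, Unpckh;
  // Unpack-low interleaves the low halves of both inputs; unpack-high
  // interleaves the high halves.
  for (int i = 0; i != NumElts / 2; ++i) {
    Unpckl[2 * i + 0] = i;                         // low half of V1
    Unpckl[2 * i + 1] = i + NumElts;               // low half of V2
    Unpckh[2 * i + 0] = i + NumElts / 2;           // high half of V1
    Unpckh[2 * i + 1] = i + NumElts / 2 + NumElts; // high half of V2
  }
  for (int M : Unpckl) std::printf("%d ", M); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  for (int M : Unpckh) std::printf("%d ", M); // 4 12 5 13 6 14 7 15
  std::printf("\n");
  return 0;
}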
11329
11330/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11331/// followed by unpack 256-bit.
11332static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11333 ArrayRef<int> Mask, SDValue V1,
11334 SDValue V2, SelectionDAG &DAG) {
11335 SmallVector<int, 32> Unpckl, Unpckh;
11336 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11337 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11338
11339 unsigned UnpackOpcode;
11340 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11341 UnpackOpcode = X86ISD::UNPCKL;
11342 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11343 UnpackOpcode = X86ISD::UNPCKH;
11344 else
11345 return SDValue();
11346
11347 // This is a "natural" unpack operation (rather than the 128-bit sectored
11348 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11349 // input in order to use the x86 instruction.
11350 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11351 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11352 V1 = DAG.getBitcast(VT, V1);
11353 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11354}
11355
11356// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11357// source into the lower elements and zeroing the upper elements.
11358static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11359 ArrayRef<int> Mask, const APInt &Zeroable,
11360 const X86Subtarget &Subtarget) {
11361 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11362 return false;
11363
11364 unsigned NumElts = Mask.size();
11365 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11366 unsigned MaxScale = 64 / EltSizeInBits;
11367
11368 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11369 unsigned SrcEltBits = EltSizeInBits * Scale;
11370 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11371 continue;
11372 unsigned NumSrcElts = NumElts / Scale;
11373 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11374 continue;
11375 unsigned UpperElts = NumElts - NumSrcElts;
11376 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11377 continue;
11378 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11379 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11380 DstVT = MVT::getIntegerVT(EltSizeInBits);
11381 if ((NumSrcElts * EltSizeInBits) >= 128) {
11382 // ISD::TRUNCATE
11383 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11384 } else {
11385 // X86ISD::VTRUNC
11386 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11387 }
11388 return true;
11389 }
11390
11391 return false;
11392}
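
// A minimal standalone sketch of the mask shape matchShuffleAsVTRUNC accepts,
// assuming an 8 x 16-bit shuffle (so MaxScale = 64 / 16 = 4). Plain C++ only;
// the mask and zeroable bits below are hypothetical.
#include <array>
#include <cstdio>

int main() {
  // <0,2,4,6, Z,Z,Z,Z>: the low half takes every 2nd element and the upper
  // half must be zeroable.
  std::array<int, 8> Mask = {0, 2, 4, 6, -1, -1, -1, -1};
  unsigned Zeroable = 0xF0; // lanes 4..7 are known zero
  for (unsigned Scale = 2; Scale <= 4; Scale += Scale) {
    unsigned NumSrcElts = 8 / Scale;
    bool Sequential = true;
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Sequential &= (Mask[i] < 0 || Mask[i] == int(i * Scale));
    unsigned UpperMask = ((1u << (8 - NumSrcElts)) - 1) << NumSrcElts;
    if (Sequential && (Zeroable & UpperMask) == UpperMask) {
      std::printf("matches a truncation with Scale = %u\n", Scale); // Scale = 2
      break;
    }
  }
  return 0;
}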
11393
11394// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11395// element padding to the final DstVT.
11396static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11397 const X86Subtarget &Subtarget,
11398 SelectionDAG &DAG, bool ZeroUppers) {
11399 MVT SrcVT = Src.getSimpleValueType();
11400 MVT DstSVT = DstVT.getScalarType();
11401 unsigned NumDstElts = DstVT.getVectorNumElements();
11402 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11403 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11404
11405 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11406 return SDValue();
11407
11408 // Perform a direct ISD::TRUNCATE if possible.
11409 if (NumSrcElts == NumDstElts)
11410 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11411
11412 if (NumSrcElts > NumDstElts) {
11413 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11414 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11415 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11416 }
11417
11418 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11419 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11420 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11421 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11422 DstVT.getSizeInBits());
11423 }
11424
11425 // Non-VLX targets must truncate from a 512-bit type, so we need to
11426 // widen, truncate and then possibly extract the original subvector.
11427 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11428 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11429 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11430 }
11431
11432 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11433 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11434 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11435 if (DstVT != TruncVT)
11436 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11437 DstVT.getSizeInBits());
11438 return Trunc;
11439}
11440
11441// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11442//
11443// An example is the following:
11444//
11445// t0: ch = EntryToken
11446// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11447// t25: v4i32 = truncate t2
11448// t41: v8i16 = bitcast t25
11449// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11450// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11451// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11452// t18: v2i64 = bitcast t51
11453//
11454// One can just use a single vpmovdw instruction; without avx512vl we need to
11455// use the zmm variant and extract the lower subvector, padding with zeroes.
11456// TODO: Merge with lowerShuffleAsVTRUNC.
11457static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11458 SDValue V2, ArrayRef<int> Mask,
11459 const APInt &Zeroable,
11460 const X86Subtarget &Subtarget,
11461 SelectionDAG &DAG) {
11462 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11463 if (!Subtarget.hasAVX512())
11464 return SDValue();
11465
11466 unsigned NumElts = VT.getVectorNumElements();
11467 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11468 unsigned MaxScale = 64 / EltSizeInBits;
11469 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11470 unsigned NumSrcElts = NumElts / Scale;
11471 unsigned UpperElts = NumElts - NumSrcElts;
11472 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11473 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11474 continue;
11475
11476 SDValue Src = V1;
11477 if (!Src.hasOneUse())
11478 return SDValue();
11479
11480 Src = peekThroughOneUseBitcasts(Src);
11481 if (Src.getOpcode() != ISD::TRUNCATE ||
11482 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11483 return SDValue();
11484 Src = Src.getOperand(0);
11485
11486 // VPMOVWB is only available with avx512bw.
11487 MVT SrcVT = Src.getSimpleValueType();
11488 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11489 !Subtarget.hasBWI())
11490 return SDValue();
11491
11492 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11493 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11494 }
11495
11496 return SDValue();
11497}
11498
11499// Attempt to match binary shuffle patterns as a truncate.
11500static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11501 SDValue V2, ArrayRef<int> Mask,
11502 const APInt &Zeroable,
11503 const X86Subtarget &Subtarget,
11504 SelectionDAG &DAG) {
11505 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11506        "Unexpected VTRUNC type");
11507 if (!Subtarget.hasAVX512())
11508 return SDValue();
11509
11510 unsigned NumElts = VT.getVectorNumElements();
11511 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11512 unsigned MaxScale = 64 / EltSizeInBits;
11513 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11514 // TODO: Support non-BWI VPMOVWB truncations?
11515 unsigned SrcEltBits = EltSizeInBits * Scale;
11516 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11517 continue;
11518
11519 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11520 // Bail if the V2 elements are undef.
11521 unsigned NumHalfSrcElts = NumElts / Scale;
11522 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11523 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11524 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11525 continue;
11526
11527 // The elements beyond the truncation must be undef/zero.
11528 unsigned UpperElts = NumElts - NumSrcElts;
11529 if (UpperElts > 0 &&
11530 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11531 continue;
11532 bool UndefUppers =
11533 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11534
11535 // As we're using both sources, we need to concat them together
11536 // and truncate from the double-sized src.
11537 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11538 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11539
11540 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11541 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11542 Src = DAG.getBitcast(SrcVT, Src);
11543 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11544 }
11545
11546 return SDValue();
11547}
11548
11549/// Check whether a compaction lowering can be done by dropping even
11550/// elements and compute how many times even elements must be dropped.
11551///
11552/// This handles shuffles which take every Nth element where N is a power of
11553/// two. Example shuffle masks:
11554///
11555/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11556/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11557/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11558/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11559/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11560/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11561///
11562/// Any of these lanes can of course be undef.
11563///
11564/// This routine only supports N <= 3.
11565/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11566/// for larger N.
11567///
11568/// \returns N above, or the number of times even elements must be dropped if
11569/// there is such a number. Otherwise returns zero.
11570static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11571 bool IsSingleInput) {
11572 // The modulus for the shuffle vector entries is based on whether this is
11573 // a single input or not.
11574 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11575 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11576        "We should only be called with masks with a power-of-2 size!");
11577
11578 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11579
11580 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11581 // and 2^3 simultaneously. This is because we may have ambiguity with
11582 // partially undef inputs.
11583 bool ViableForN[3] = {true, true, true};
11584
11585 for (int i = 0, e = Mask.size(); i < e; ++i) {
11586 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11587 // want.
11588 if (Mask[i] < 0)
11589 continue;
11590
11591 bool IsAnyViable = false;
11592 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11593 if (ViableForN[j]) {
11594 uint64_t N = j + 1;
11595
11596 // The shuffle mask must be equal to (i * 2^N) % M.
11597 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11598 IsAnyViable = true;
11599 else
11600 ViableForN[j] = false;
11601 }
11602 // Early exit if we exhaust the possible powers of two.
11603 if (!IsAnyViable)
11604 break;
11605 }
11606
11607 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11608 if (ViableForN[j])
11609 return j + 1;
11610
11611 // Return 0 as there is no viable power of two.
11612 return 0;
11613}
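
// A minimal standalone sketch of the stride test performed by
// canLowerByDroppingEvenElements: each defined mask entry i must equal
// (i << N) modulo the shuffle modulus for a single power-of-two N <= 3.
// Plain C++ only; the sample mask is the single-input N = 1 case from the
// comment above.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<int, 16> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                              0, 2, 4, 6, 8, 10, 12, 14};
  const uint64_t ModMask = Mask.size() - 1; // single input -> modulus 16
  bool ViableForN[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match any stride
    for (unsigned j = 0; j != 3; ++j)
      if ((uint64_t)Mask[i] != ((uint64_t(i) << (j + 1)) & ModMask))
        ViableForN[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (ViableForN[j]) {
      std::printf("viable with N = %u\n", j + 1); // prints N = 1
      break;
    }
  return 0;
}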
11614
11615// X86 has dedicated pack instructions that can handle specific truncation
11616// operations: PACKSS and PACKUS.
11617// Checks for compaction shuffle masks if MaxStages > 1.
11618// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11619static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11620 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11621 SelectionDAG &DAG,
11622 const X86Subtarget &Subtarget,
11623 unsigned MaxStages = 1) {
11624 unsigned NumElts = VT.getVectorNumElements();
11625 unsigned BitSize = VT.getScalarSizeInBits();
11626 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11627        "Illegal maximum compaction");
11628
11629 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11630 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11631 unsigned NumPackedBits = NumSrcBits - BitSize;
11632 SDValue VV1 = DAG.getBitcast(PackVT, N1);
11633 SDValue VV2 = DAG.getBitcast(PackVT, N2);
11634 if (Subtarget.hasSSE41() || BitSize == 8) {
11635 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11636 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
11637 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
11638 V1 = VV1;
11639 V2 = VV2;
11640 SrcVT = PackVT;
11641 PackOpcode = X86ISD::PACKUS;
11642 return true;
11643 }
11644 }
11645 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
11646 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
11647 V1 = VV1;
11648 V2 = VV2;
11649 SrcVT = PackVT;
11650 PackOpcode = X86ISD::PACKSS;
11651 return true;
11652 }
11653 return false;
11654 };
11655
11656 // Attempt to match against wider and wider compaction patterns.
11657 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11658 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11659 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11660
11661 // Try binary shuffle.
11662 SmallVector<int, 32> BinaryMask;
11663 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11664 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11665 if (MatchPACK(V1, V2, PackVT))
11666 return true;
11667
11668 // Try unary shuffle.
11669 SmallVector<int, 32> UnaryMask;
11670 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11671 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11672 if (MatchPACK(V1, V1, PackVT))
11673 return true;
11674 }
11675
11676 return false;
11677}
11678
11679static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11680 SDValue V1, SDValue V2, SelectionDAG &DAG,
11681 const X86Subtarget &Subtarget) {
11682 MVT PackVT;
11683 unsigned PackOpcode;
11684 unsigned SizeBits = VT.getSizeInBits();
11685 unsigned EltBits = VT.getScalarSizeInBits();
11686 unsigned MaxStages = Log2_32(64 / EltBits);
11687 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11688 Subtarget, MaxStages))
11689 return SDValue();
11690
11691 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11692 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11693
11694 // Don't lower multi-stage packs on AVX512, truncation is better.
11695 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11696 return SDValue();
11697
11698 // Pack to the largest type possible:
11699 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11700 unsigned MaxPackBits = 16;
11701 if (CurrentEltBits > 16 &&
11702 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11703 MaxPackBits = 32;
11704
11705 // Repeatedly pack down to the target size.
11706 SDValue Res;
11707 for (unsigned i = 0; i != NumStages; ++i) {
11708 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11709 unsigned NumSrcElts = SizeBits / SrcEltBits;
11710 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11711 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11712 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11713 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11714 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11715 DAG.getBitcast(SrcVT, V2));
11716 V1 = V2 = Res;
11717 CurrentEltBits /= 2;
11718 }
11719 assert(Res && Res.getValueType() == VT &&
11720        "Failed to lower compaction shuffle");
11721 return Res;
11722}
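
// A minimal standalone sketch of the stage count used above: each PACK stage
// halves the element width, clamped to what PACK*SDW (32->16) and PACK*SWB
// (16->8) can pack per stage. Plain C++ only; the 64-bit-to-8-bit example is
// hypothetical.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned EltBits = 8;         // desired result element width (vXi8)
  unsigned CurrentEltBits = 64; // element width of the matched pack source
  unsigned NumStages = 0;
  for (unsigned B = CurrentEltBits; B > EltBits; B /= 2)
    ++NumStages;                // log2(CurrentEltBits / EltBits) == 3
  unsigned MaxPackBits = 32;    // widest lane a single pack can consume
  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    std::printf("stage %u: pack %u-bit lanes down to %u-bit\n", i, SrcEltBits,
                SrcEltBits / 2);
    CurrentEltBits /= 2;
  }
  return 0;
}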
11723
11724/// Try to emit a bitmask instruction for a shuffle.
11725///
11726/// This handles cases where we can model a blend exactly as a bitmask due to
11727/// one of the inputs being zeroable.
11728static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11729 SDValue V2, ArrayRef<int> Mask,
11730 const APInt &Zeroable,
11731 const X86Subtarget &Subtarget,
11732 SelectionDAG &DAG) {
11733 MVT MaskVT = VT;
11734 MVT EltVT = VT.getVectorElementType();
11735 SDValue Zero, AllOnes;
11736 // Use f64 if i64 isn't legal.
11737 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11738 EltVT = MVT::f64;
11739 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11740 }
11741
11742 MVT LogicVT = VT;
11743 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11744 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11745 APFloat AllOnesValue = APFloat::getAllOnesValue(
11746 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11747 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11748 LogicVT =
11749 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11750 } else {
11751 Zero = DAG.getConstant(0, DL, EltVT);
11752 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11753 }
11754
11755 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11756 SDValue V;
11757 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11758 if (Zeroable[i])
11759 continue;
11760 if (Mask[i] % Size != i)
11761 return SDValue(); // Not a blend.
11762 if (!V)
11763 V = Mask[i] < Size ? V1 : V2;
11764 else if (V != (Mask[i] < Size ? V1 : V2))
11765 return SDValue(); // Can only let one input through the mask.
11766
11767 VMaskOps[i] = AllOnes;
11768 }
11769 if (!V)
11770 return SDValue(); // No non-zeroable elements!
11771
11772 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11773 VMask = DAG.getBitcast(LogicVT, VMask);
11774 V = DAG.getBitcast(LogicVT, V);
11775 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11776 return DAG.getBitcast(VT, And);
11777}
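
// A minimal standalone sketch of the bitmask idea above: when the other input
// of the blend is zeroable, the whole blend is just an AND with a per-lane
// mask that is all-ones wherever the surviving input passes through. Plain
// C++ only; lanes and values are hypothetical.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<uint16_t, 8> V = {1, 2, 3, 4, 5, 6, 7, 8};
  // Lanes 2 and 5 are zeroable in the shuffle; every other lane keeps V.
  std::array<uint16_t, 8> Mask{};
  for (int i = 0; i != 8; ++i)
    Mask[i] = (i == 2 || i == 5) ? 0 : 0xFFFF;
  for (int i = 0; i != 8; ++i)
    std::printf("%u ", unsigned(V[i] & Mask[i])); // 1 2 0 4 5 0 7 8
  std::printf("\n");
  return 0;
}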
11778
11779/// Try to emit a blend instruction for a shuffle using bit math.
11780///
11781/// This is used as a fallback approach when first class blend instructions are
11782/// unavailable. Currently it is only suitable for integer vectors, but could
11783/// be generalized for floating point vectors if desirable.
11784static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11785 SDValue V2, ArrayRef<int> Mask,
11786 SelectionDAG &DAG) {
11787 assert(VT.isInteger() && "Only supports integer vector types!");
11788 MVT EltVT = VT.getVectorElementType();
11789 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11790 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11791 SmallVector<SDValue, 16> MaskOps;
11792 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11793 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11794 return SDValue(); // Shuffled input!
11795 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11796 }
11797
11798 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11799 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
11800 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
11801 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
11802}
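
// A minimal standalone sketch of the AND/ANDNP/OR bit blend above: with a
// per-lane mask M that is all-ones where the result takes V1, the blend is
// (V1 & M) | (V2 & ~M), computed lane by lane. Plain C++ only; the inputs and
// the even/odd blend pattern are hypothetical.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<uint16_t, 8> V1 = {10, 11, 12, 13, 14, 15, 16, 17};
  std::array<uint16_t, 8> V2 = {20, 21, 22, 23, 24, 25, 26, 27};
  // Shuffle mask <0,9,2,11,4,13,6,15>: even lanes from V1, odd lanes from V2.
  std::array<uint16_t, 8> M{};
  for (int i = 0; i != 8; ++i)
    M[i] = (i % 2 == 0) ? 0xFFFF : 0x0000;
  for (int i = 0; i != 8; ++i)
    std::printf("%u ", unsigned(uint16_t((V1[i] & M[i]) | (V2[i] & uint16_t(~M[i])))));
  std::printf("\n"); // prints 10 21 12 23 14 25 16 27
  return 0;
}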
11803
11804static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11805 SDValue PreservedSrc,
11806 const X86Subtarget &Subtarget,
11807 SelectionDAG &DAG);
11808
11809static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
11810 MutableArrayRef<int> Mask,
11811 const APInt &Zeroable, bool &ForceV1Zero,
11812 bool &ForceV2Zero, uint64_t &BlendMask) {
11813 bool V1IsZeroOrUndef =
11814 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11815 bool V2IsZeroOrUndef =
11816 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11817
11818 BlendMask = 0;
11819 ForceV1Zero = false, ForceV2Zero = false;
11820 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11821
11822 // Attempt to generate the binary blend mask. If an input is zero then
11823 // we can use any lane.
11824 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11825 int M = Mask[i];
11826 if (M == SM_SentinelUndef)
11827 continue;
11828 if (M == i)
11829 continue;
11830 if (M == i + Size) {
11831 BlendMask |= 1ull << i;
11832 continue;
11833 }
11834 if (Zeroable[i]) {
11835 if (V1IsZeroOrUndef) {
11836 ForceV1Zero = true;
11837 Mask[i] = i;
11838 continue;
11839 }
11840 if (V2IsZeroOrUndef) {
11841 ForceV2Zero = true;
11842 BlendMask |= 1ull << i;
11843 Mask[i] = i + Size;
11844 continue;
11845 }
11846 }
11847 return false;
11848 }
11849 return true;
11850}
11851
11852static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
11853 int Scale) {
11854 uint64_t ScaledMask = 0;
11855 for (int i = 0; i != Size; ++i)
11856 if (BlendMask & (1ull << i))
11857 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
11858 return ScaledMask;
11859}
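
// A minimal standalone sketch of the blend-mask scaling above: widening a
// 4-element blend mask to the 8-element mask used once every element is split
// in two. Plain C++ only; the helper name is illustrative, not an LLVM API.
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // 4-element mask 0b1010 scaled by 2 -> 8-element mask 0b11001100 (0xcc).
  std::printf("%#llx\n", (unsigned long long)scaleBlendMask(0b1010, 4, 2));
  return 0;
}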
11860
11861/// Try to emit a blend instruction for a shuffle.
11862///
11863/// This doesn't do any checks for the availability of instructions for blending
11864/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11865/// be matched in the backend with the type given. What it does check for is
11866/// that the shuffle mask is a blend, or convertible into a blend with zero.
11867static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11868 SDValue V2, ArrayRef<int> Original,
11869 const APInt &Zeroable,
11870 const X86Subtarget &Subtarget,
11871 SelectionDAG &DAG) {
11872 uint64_t BlendMask = 0;
11873 bool ForceV1Zero = false, ForceV2Zero = false;
11874 SmallVector<int, 64> Mask(Original.begin(), Original.end());
11875 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11876 BlendMask))
11877 return SDValue();
11878
11879 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11880 if (ForceV1Zero)
11881 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11882 if (ForceV2Zero)
11883 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11884
11885 switch (VT.SimpleTy) {
11886 case MVT::v4i64:
11887 case MVT::v8i32:
11888 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11889 LLVM_FALLTHROUGH;
11890 case MVT::v4f64:
11891 case MVT::v8f32:
11892 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11893 LLVM_FALLTHROUGH;
11894 case MVT::v2f64:
11895 case MVT::v2i64:
11896 case MVT::v4f32:
11897 case MVT::v4i32:
11898 case MVT::v8i16:
11899 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11900 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11901 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11902 case MVT::v16i16: {
11903 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11904 SmallVector<int, 8> RepeatedMask;
11905 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11906 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11907 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11908 BlendMask = 0;
11909 for (int i = 0; i < 8; ++i)
11910 if (RepeatedMask[i] >= 8)
11911 BlendMask |= 1ull << i;
11912 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11913 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11914 }
11915 // Use PBLENDW for lower/upper lanes and then blend lanes.
11916 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11917 // merge to VSELECT where useful.
11918 uint64_t LoMask = BlendMask & 0xFF;
11919 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11920 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11921 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11922 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11923 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11924 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11925 return DAG.getVectorShuffle(
11926 MVT::v16i16, DL, Lo, Hi,
11927 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11928 }
11929 LLVM_FALLTHROUGH;
11930 }
11931 case MVT::v32i8:
11932 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11933 LLVM_FALLTHROUGH;
11934 case MVT::v16i8: {
11935 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11936
11937 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11938 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11939 Subtarget, DAG))
11940 return Masked;
11941
11942 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11943 MVT IntegerType =
11944 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11945 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11946 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11947 }
11948
11949 // If we have VPTERNLOG, we can use that as a bit blend.
11950 if (Subtarget.hasVLX())
11951 if (SDValue BitBlend =
11952 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11953 return BitBlend;
11954
11955 // Scale the blend by the number of bytes per element.
11956 int Scale = VT.getScalarSizeInBits() / 8;
11957
11958 // This form of blend is always done on bytes. Compute the byte vector
11959 // type.
11960 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11961
11962 // x86 allows load folding with blendvb from the 2nd source operand. But
11963 // we are still using LLVM select here (see comment below), so that's V1.
11964 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11965 // allow that load-folding possibility.
11966 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11967 ShuffleVectorSDNode::commuteMask(Mask);
11968 std::swap(V1, V2);
11969 }
11970
11971 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11972 // mix of LLVM's code generator and the x86 backend. We tell the code
11973 // generator that boolean values in the elements of an x86 vector register
11974 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11975 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11976 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11977 // of the element (the remaining are ignored) and 0 in that high bit would
11978 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11979 // the LLVM model for boolean values in vector elements gets the relevant
11980 // bit set, it is set backwards and over constrained relative to x86's
11981 // actual model.
11982 SmallVector<SDValue, 32> VSELECTMask;
11983 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11984 for (int j = 0; j < Scale; ++j)
11985 VSELECTMask.push_back(
11986 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
11987 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
11988 MVT::i8));
11989
11990 V1 = DAG.getBitcast(BlendVT, V1);
11991 V2 = DAG.getBitcast(BlendVT, V2);
11992 return DAG.getBitcast(
11993 VT,
11994 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11995 V1, V2));
11996 }
11997 case MVT::v16f32:
11998 case MVT::v8f64:
11999 case MVT::v8i64:
12000 case MVT::v16i32:
12001 case MVT::v32i16:
12002 case MVT::v64i8: {
12003 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12004 bool OptForSize = DAG.shouldOptForSize();
12005 if (!OptForSize) {
12006 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12007 Subtarget, DAG))
12008 return Masked;
12009 }
12010
12011 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12012 // masked move.
12013 MVT IntegerType =
12014 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12015 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12016 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12017 }
12018 default:
12019 llvm_unreachable("Not a supported integer vector type!");
12020 }
12021}
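
// A minimal standalone sketch of the VSELECT mask convention described in the
// byte-blend case above: each byte of the select mask is -1 (all ones) to take
// operand #1 and 0 to take operand #2, scaled from the element-level shuffle
// mask by the number of bytes per element. Plain C++ only; the mask is
// hypothetical.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int NumElts = 4, Scale = 2;         // e.g. 16-bit elements -> 2 bytes
  std::array<int, NumElts> Mask = {0, 5, 2, 7}; // even lanes V1, odd lanes V2
  std::array<int8_t, NumElts * Scale> ByteMask{};
  for (int i = 0; i != NumElts; ++i)
    for (int j = 0; j != Scale; ++j)
      ByteMask[i * Scale + j] = Mask[i] < NumElts ? -1 : 0;
  for (int8_t B : ByteMask)
    std::printf("%d ", B); // -1 -1 0 0 -1 -1 0 0
  std::printf("\n");
  return 0;
}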
12022
12023/// Try to lower as a blend of elements from two inputs followed by
12024/// a single-input permutation.
12025///
12026/// This matches the pattern where we can blend elements from two inputs and
12027/// then reduce the shuffle to a single-input permutation.
12028static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12029 SDValue V1, SDValue V2,
12030 ArrayRef<int> Mask,
12031 SelectionDAG &DAG,
12032 bool ImmBlends = false) {
12033 // We build up the blend mask while checking whether a blend is a viable way
12034 // to reduce the shuffle.
12035 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12036 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12037
12038 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12039 if (Mask[i] < 0)
12040 continue;
12041
12042 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12043
12044 if (BlendMask[Mask[i] % Size] < 0)
12045 BlendMask[Mask[i] % Size] = Mask[i];
12046 else if (BlendMask[Mask[i] % Size] != Mask[i])
12047 return SDValue(); // Can't blend in the needed input!
12048
12049 PermuteMask[i] = Mask[i] % Size;
12050 }
12051
12052 // If only immediate blends, then bail if the blend mask can't be widened to
12053 // i16.
12054 unsigned EltSize = VT.getScalarSizeInBits();
12055 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12056 return SDValue();
12057
12058 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12059 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12060}
12061
12062/// Try to lower as an unpack of elements from two inputs followed by
12063/// a single-input permutation.
12064///
12065/// This matches the pattern where we can unpack elements from two inputs and
12066/// then reduce the shuffle to a single-input (wider) permutation.
12067static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12068 SDValue V1, SDValue V2,
12069 ArrayRef<int> Mask,
12070 SelectionDAG &DAG) {
12071 int NumElts = Mask.size();
12072 int NumLanes = VT.getSizeInBits() / 128;
12073 int NumLaneElts = NumElts / NumLanes;
12074 int NumHalfLaneElts = NumLaneElts / 2;
12075
12076 bool MatchLo = true, MatchHi = true;
12077 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12078
12079 // Determine UNPCKL/UNPCKH type and operand order.
12080 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12081 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12082 int M = Mask[Lane + Elt];
12083 if (M < 0)
12084 continue;
12085
12086 SDValue &Op = Ops[Elt & 1];
12087 if (M < NumElts && (Op.isUndef() || Op == V1))
12088 Op = V1;
12089 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12090 Op = V2;
12091 else
12092 return SDValue();
12093
12094 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12095 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12096 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12097 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12098 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12099 if (!MatchLo && !MatchHi)
12100 return SDValue();
12101 }
12102 }
12103 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12104
12105 // Now check that each pair of elts come from the same unpack pair
12106 // and set the permute mask based on each pair.
12107 // TODO - Investigate cases where we permute individual elements.
12108 SmallVector<int, 32> PermuteMask(NumElts, -1);
12109 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12110 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12111 int M0 = Mask[Lane + Elt + 0];
12112 int M1 = Mask[Lane + Elt + 1];
12113 if (0 <= M0 && 0 <= M1 &&
12114 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12115 return SDValue();
12116 if (0 <= M0)
12117 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12118 if (0 <= M1)
12119 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12120 }
12121 }
12122
12123 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12124 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12125 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12126}
12127
12128/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12129/// permuting the elements of the result in place.
12130static SDValue lowerShuffleAsByteRotateAndPermute(
12131 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12132 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12133 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12134 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12135 (VT.is512BitVector() && !Subtarget.hasBWI()))
12136 return SDValue();
12137
12138 // We don't currently support lane crossing permutes.
12139 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12140 return SDValue();
12141
12142 int Scale = VT.getScalarSizeInBits() / 8;
12143 int NumLanes = VT.getSizeInBits() / 128;
12144 int NumElts = VT.getVectorNumElements();
12145 int NumEltsPerLane = NumElts / NumLanes;
12146
12147 // Determine range of mask elts.
12148 bool Blend1 = true;
12149 bool Blend2 = true;
12150 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12151 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12152 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12153 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12154 int M = Mask[Lane + Elt];
12155 if (M < 0)
12156 continue;
12157 if (M < NumElts) {
12158 Blend1 &= (M == (Lane + Elt));
12159 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12160 M = M % NumEltsPerLane;
12161 Range1.first = std::min(Range1.first, M);
12162 Range1.second = std::max(Range1.second, M);
12163 } else {
12164 M -= NumElts;
12165 Blend2 &= (M == (Lane + Elt));
12166 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12167 M = M % NumEltsPerLane;
12168 Range2.first = std::min(Range2.first, M);
12169 Range2.second = std::max(Range2.second, M);
12170 }
12171 }
12172 }
12173
12174 // Bail if we don't need both elements.
12175 // TODO - it might be worth doing this for unary shuffles if the permute
12176 // can be widened.
12177 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12178 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12179 return SDValue();
12180
12181 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12182 return SDValue();
12183
12184 // Rotate the 2 ops so we can access both ranges, then permute the result.
12185 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12186 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12187 SDValue Rotate = DAG.getBitcast(
12188 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12189 DAG.getBitcast(ByteVT, Lo),
12190 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12191 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12192 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12193 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12194 int M = Mask[Lane + Elt];
12195 if (M < 0)
12196 continue;
12197 if (M < NumElts)
12198 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12199 else
12200 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12201 }
12202 }
12203 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12204 };
12205
12206 // Check if the ranges are small enough to rotate from either direction.
12207 if (Range2.second < Range1.first)
12208 return RotateAndPermute(V1, V2, Range1.first, 0);
12209 if (Range1.second < Range2.first)
12210 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12211 return SDValue();
12212}
12213
12214/// Generic routine to decompose a shuffle and blend into independent
12215/// blends and permutes.
12216///
12217/// This matches the extremely common pattern for handling combined
12218/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12219/// operations. It will try to pick the best arrangement of shuffles and
12220/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12221static SDValue lowerShuffleAsDecomposedShuffleMerge(
12222 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12223 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12224 int NumElts = Mask.size();
12225 int NumLanes = VT.getSizeInBits() / 128;
12226 int NumEltsPerLane = NumElts / NumLanes;
12227
12228 // Shuffle the input elements into the desired positions in V1 and V2 and
12229 // unpack/blend them together.
12230 bool IsAlternating = true;
12231 SmallVector<int, 32> V1Mask(NumElts, -1);
12232 SmallVector<int, 32> V2Mask(NumElts, -1);
12233 SmallVector<int, 32> FinalMask(NumElts, -1);
12234 for (int i = 0; i < NumElts; ++i) {
12235 int M = Mask[i];
12236 if (M >= 0 && M < NumElts) {
12237 V1Mask[i] = M;
12238 FinalMask[i] = i;
12239 IsAlternating &= (i & 1) == 0;
12240 } else if (M >= NumElts) {
12241 V2Mask[i] = M - NumElts;
12242 FinalMask[i] = i + NumElts;
12243 IsAlternating &= (i & 1) == 1;
12244 }
12245 }
12246
12247 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12248 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12249 // the shuffle may be able to fold with a load or other benefit. However, when
12250 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12251 // pre-shuffle first is a better strategy.
12252 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12253 // Only prefer immediate blends to unpack/rotate.
12254 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12255 DAG, true))
12256 return BlendPerm;
12257 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12258 DAG))
12259 return UnpackPerm;
12260 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12261 DL, VT, V1, V2, Mask, Subtarget, DAG))
12262 return RotatePerm;
12263 // Unpack/rotate failed - try again with variable blends.
12264 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12265 DAG))
12266 return BlendPerm;
12267 }
12268
12269 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12270 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12271 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12272 // than half the elements coming from each source.
12273 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12274 V1Mask.assign(NumElts, -1);
12275 V2Mask.assign(NumElts, -1);
12276 FinalMask.assign(NumElts, -1);
12277 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12278 for (int j = 0; j != NumEltsPerLane; ++j) {
12279 int M = Mask[i + j];
12280 if (M >= 0 && M < NumElts) {
12281 V1Mask[i + (j / 2)] = M;
12282 FinalMask[i + j] = i + (j / 2);
12283 } else if (M >= NumElts) {
12284 V2Mask[i + (j / 2)] = M - NumElts;
12285 FinalMask[i + j] = i + (j / 2) + NumElts;
12286 }
12287 }
12288 }
12289
12290 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12291 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12292 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12293}
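
// A minimal standalone sketch of the decomposition performed above: element i
// of the result comes from V1 if Mask[i] < NumElts and from V2 otherwise, so
// the shuffle splits into two single-input shuffles followed by a blend on the
// final positions. Plain C++ only; the sample mask is hypothetical.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElts = 8;
  std::array<int, NumElts> Mask = {0, 9, 2, 11, 12, 5, 14, 7};
  std::array<int, NumElts> V1Mask, V2Mask, FinalMask;
  V1Mask.fill(-1); V2Mask.fill(-1); FinalMask.fill(-1);
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0 && Mask[i] < NumElts) {
      V1Mask[i] = Mask[i];           // move the V1 element into place
      FinalMask[i] = i;              // then take it from shuffled V1
    } else if (Mask[i] >= NumElts) {
      V2Mask[i] = Mask[i] - NumElts; // move the V2 element into place
      FinalMask[i] = i + NumElts;    // then take it from shuffled V2
    }
  }
  for (int M : FinalMask)
    std::printf("%d ", M); // 0 9 2 11 12 5 14 7: a pure blend of the shuffles
  std::printf("\n");
  return 0;
}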
12294
12295/// Try to lower a vector shuffle as a bit rotation.
12296///
12297/// Look for a repeated rotation pattern in each sub group.
12298/// Returns a ISD::ROTL element rotation amount or -1 if failed.
12299static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12300 int NumElts = Mask.size();
12301 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12302
12303 int RotateAmt = -1;
12304 for (int i = 0; i != NumElts; i += NumSubElts) {
12305 for (int j = 0; j != NumSubElts; ++j) {
12306 int M = Mask[i + j];
12307 if (M < 0)
12308 continue;
12309 if (!isInRange(M, i, i + NumSubElts))
12310 return -1;
12311 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12312 if (0 <= RotateAmt && Offset != RotateAmt)
12313 return -1;
12314 RotateAmt = Offset;
12315 }
12316 }
12317 return RotateAmt;
12318}
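
// A minimal standalone sketch of the per-group rotation test above: within
// each group of NumSubElts lanes, every defined index must stay inside its
// group and imply the same rotation offset. Plain C++ only; the helper name
// and the sample v16i8-style mask (each 4-byte group rotated by one element,
// i.e. a ROTL of 8 bits) are illustrative.
#include <array>
#include <cstdio>

static int matchBitRotateSketch(const int *Mask, int NumElts, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses a group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // inconsistent rotation amount
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::array<int, 16> Mask = {3, 0, 1, 2, 7, 4, 5, 6,
                              11, 8, 9, 10, 15, 12, 13, 14};
  std::printf("rotate amount (in elements) = %d\n",
              matchBitRotateSketch(Mask.data(), 16, 4)); // prints 1
  return 0;
}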
12319
12320static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12321 const X86Subtarget &Subtarget,
12322 ArrayRef<int> Mask) {
12323 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12324 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12325
12326 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12327 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12328 int MaxSubElts = 64 / EltSizeInBits;
12329 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12330 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12331 if (RotateAmt < 0)
12332 continue;
12333
12334 int NumElts = Mask.size();
12335 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12336 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12337 return RotateAmt * EltSizeInBits;
12338 }
12339
12340 return -1;
12341}
12342
12343/// Lower shuffle using X86ISD::VROTLI rotations.
12344static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12345 ArrayRef<int> Mask,
12346 const X86Subtarget &Subtarget,
12347 SelectionDAG &DAG) {
12348 // Only XOP + AVX512 targets have bit rotation instructions.
12349 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12350 bool IsLegal =
12351 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12352 if (!IsLegal && Subtarget.hasSSE3())
12353 return SDValue();
12354
12355 MVT RotateVT;
12356 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12357 Subtarget, Mask);
12358 if (RotateAmt < 0)
12359 return SDValue();
12360
12361 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12362 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12363 // widen to vXi16 or more then the existing lowering will be better.
12364 if (!IsLegal) {
12365 if ((RotateAmt % 16) == 0)
12366 return SDValue();
12367 // TODO: Use getTargetVShiftByConstNode.
12368 unsigned ShlAmt = RotateAmt;
12369 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12370 V1 = DAG.getBitcast(RotateVT, V1);
12371 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12372 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12373 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12374 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12375 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12376 return DAG.getBitcast(VT, Rot);
12377 }
12378
12379 SDValue Rot =
12380 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12381 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12382 return DAG.getBitcast(VT, Rot);
12383}
12384
12385/// Try to match a vector shuffle as an element rotation.
12386///
12387/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12388static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12389 ArrayRef<int> Mask) {
12390 int NumElts = Mask.size();
12391
12392 // We need to detect various ways of spelling a rotation:
12393 // [11, 12, 13, 14, 15, 0, 1, 2]
12394 // [-1, 12, 13, 14, -1, -1, 1, -1]
12395 // [-1, -1, -1, -1, -1, -1, 1, 2]
12396 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12397 // [-1, 4, 5, 6, -1, -1, 9, -1]
12398 // [-1, 4, 5, 6, -1, -1, -1, -1]
12399 int Rotation = 0;
12400 SDValue Lo, Hi;
12401 for (int i = 0; i < NumElts; ++i) {
12402 int M = Mask[i];
12403 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12404        "Unexpected mask index.");
12405 if (M < 0)
12406 continue;
12407
12408 // Determine where a rotated vector would have started.
12409 int StartIdx = i - (M % NumElts);
12410 if (StartIdx == 0)
12411 // The identity rotation isn't interesting, stop.
12412 return -1;
12413
12414 // If we found the tail of a vector the rotation must be the missing
12415 // front. If we found the head of a vector, it must be how much of the
12416 // head.
12417 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12418
12419 if (Rotation == 0)
12420 Rotation = CandidateRotation;
12421 else if (Rotation != CandidateRotation)
12422 // The rotations don't match, so we can't match this mask.
12423 return -1;
12424
12425 // Compute which value this mask is pointing at.
12426 SDValue MaskV = M < NumElts ? V1 : V2;
12427
12428 // Compute which of the two target values this index should be assigned
12429 // to. This reflects whether the high elements are remaining or the low
12430 // elements are remaining.
12431 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12432
12433 // Either set up this value if we've not encountered it before, or check
12434 // that it remains consistent.
12435 if (!TargetV)
12436 TargetV = MaskV;
12437 else if (TargetV != MaskV)
12438 // This may be a rotation, but it pulls from the inputs in some
12439 // unsupported interleaving.
12440 return -1;
12441 }
12442
12443 // Check that we successfully analyzed the mask, and normalize the results.
12444   assert(Rotation != 0 && "Failed to locate a viable rotation!");
12445   assert((Lo || Hi) && "Failed to find a rotated input vector!");
12446 if (!Lo)
12447 Lo = Hi;
12448 else if (!Hi)
12449 Hi = Lo;
12450
12451 V1 = Lo;
12452 V2 = Hi;
12453
12454 return Rotation;
12455}
12456
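// --- Added illustration (not part of X86ISelLowering.cpp) ---
// A simplified, standalone version of the rotation matching above, operating
// on a plain mask where -1 means undef and indices >= NumElts refer to the
// second input; the two-input Lo/Hi bookkeeping is omitted for brevity.
#include <vector>
static int matchMaskAsRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                       // undef is compatible with anything
    int StartIdx = i - (M % NumElts); // where a rotated vector would start
    if (StartIdx == 0)
      return -1;                      // identity rotation: not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                      // inconsistent rotation amounts
  }
  return Rotation == 0 ? -1 : Rotation; // e.g. {11,12,13,14,15,0,1,2} -> 3
}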
12457/// Try to lower a vector shuffle as a byte rotation.
12458///
12459/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12460/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12461/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12462 /// try to generically lower a vector shuffle through such a pattern. It
12463/// does not check for the profitability of lowering either as PALIGNR or
12464/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12465/// This matches shuffle vectors that look like:
12466///
12467/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12468///
12469/// Essentially it concatenates V1 and V2, shifts right by some number of
12470/// elements, and takes the low elements as the result. Note that while this is
12471/// specified as a *right shift* because x86 is little-endian, it is a *left
12472/// rotate* of the vector lanes.
12473static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12474 ArrayRef<int> Mask) {
12475 // Don't accept any shuffles with zero elements.
12476 if (isAnyZero(Mask))
12477 return -1;
12478
12479 // PALIGNR works on 128-bit lanes.
12480 SmallVector<int, 16> RepeatedMask;
12481 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12482 return -1;
12483
12484 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12485 if (Rotation <= 0)
12486 return -1;
12487
12488 // PALIGNR rotates bytes, so we need to scale the
12489 // rotation based on how many bytes are in the vector lane.
12490 int NumElts = RepeatedMask.size();
12491 int Scale = 16 / NumElts;
12492 return Rotation * Scale;
12493}
12494
12495static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12496 SDValue V2, ArrayRef<int> Mask,
12497 const X86Subtarget &Subtarget,
12498 SelectionDAG &DAG) {
12499   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12500
12501 SDValue Lo = V1, Hi = V2;
12502 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12503 if (ByteRotation <= 0)
12504 return SDValue();
12505
12506 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12507 // PSLLDQ/PSRLDQ.
12508 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12509 Lo = DAG.getBitcast(ByteVT, Lo);
12510 Hi = DAG.getBitcast(ByteVT, Hi);
12511
12512 // SSSE3 targets can use the palignr instruction.
12513 if (Subtarget.hasSSSE3()) {
12514     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12515            "512-bit PALIGNR requires BWI instructions");
12516 return DAG.getBitcast(
12517 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12518 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12519 }
12520
12521   assert(VT.is128BitVector() &&
12522          "Rotate-based lowering only supports 128-bit lowering!");
12523   assert(Mask.size() <= 16 &&
12524          "Can shuffle at most 16 bytes in a 128-bit vector!");
12525   assert(ByteVT == MVT::v16i8 &&
12526          "SSE2 rotate lowering only needed for v16i8!");
12527
12528 // Default SSE2 implementation
12529 int LoByteShift = 16 - ByteRotation;
12530 int HiByteShift = ByteRotation;
12531
12532 SDValue LoShift =
12533 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12534 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12535 SDValue HiShift =
12536 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12537 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12538 return DAG.getBitcast(VT,
12539 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12540}
12541
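// --- Added illustration (not part of X86ISelLowering.cpp) ---
// What the SSE2 fallback above computes, written out on plain byte arrays:
// byte i of the result comes from position i + ByteRotation of the 32-byte
// concatenation that has Hi in its low 16 bytes and Lo in its high 16 bytes,
// i.e. OR(PSLLDQ(Lo, 16 - ByteRotation), PSRLDQ(Hi, ByteRotation)).
#include <array>
#include <cstdint>
static std::array<uint8_t, 16> byteRotate(const std::array<uint8_t, 16> &Lo,
                                          const std::array<uint8_t, 16> &Hi,
                                          unsigned ByteRotation) {
  std::array<uint8_t, 16> Out{};
  for (unsigned i = 0; i != 16; ++i) {
    unsigned Src = i + ByteRotation;          // index into the concatenation
    Out[i] = Src < 16 ? Hi[Src] : Lo[Src - 16];
  }
  return Out;
}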
12542/// Try to lower a vector shuffle as a dword/qword rotation.
12543///
12544 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12545 /// rotation of the concatenation of two vectors; this routine will
12546 /// try to generically lower a vector shuffle through such a pattern.
12547///
12548/// Essentially it concatenates V1 and V2, shifts right by some number of
12549/// elements, and takes the low elements as the result. Note that while this is
12550/// specified as a *right shift* because x86 is little-endian, it is a *left
12551/// rotate* of the vector lanes.
12552static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12553 SDValue V2, ArrayRef<int> Mask,
12554 const X86Subtarget &Subtarget,
12555 SelectionDAG &DAG) {
12556   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12557          "Only 32-bit and 64-bit elements are supported!");
12558
12559 // 128/256-bit vectors are only supported with VLX.
12560   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12561          && "VLX required for 128/256-bit vectors");
12562
12563 SDValue Lo = V1, Hi = V2;
12564 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12565 if (Rotation <= 0)
12566 return SDValue();
12567
12568 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12569 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12570}
12571
12572/// Try to lower a vector shuffle as a byte shift sequence.
12573static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12574 SDValue V2, ArrayRef<int> Mask,
12575 const APInt &Zeroable,
12576 const X86Subtarget &Subtarget,
12577 SelectionDAG &DAG) {
12578   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12579   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12580
12581 // We need a shuffle that has zeros at one/both ends and a sequential
12582 // shuffle from one source within.
12583 unsigned ZeroLo = Zeroable.countTrailingOnes();
12584 unsigned ZeroHi = Zeroable.countLeadingOnes();
12585 if (!ZeroLo && !ZeroHi)
12586 return SDValue();
12587
12588 unsigned NumElts = Mask.size();
12589 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12590 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12591 return SDValue();
12592
12593 unsigned Scale = VT.getScalarSizeInBits() / 8;
12594 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12595 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12596 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12597 return SDValue();
12598
12599 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12600 Res = DAG.getBitcast(MVT::v16i8, Res);
12601
12602 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12603 // inner sequential set of elements, possibly offset:
12604 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12605 // 01234567 --> 4567zzzz --> zzzzz456
12606 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12607 if (ZeroLo == 0) {
12608 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12609 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12610 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12611 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12612 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12613 } else if (ZeroHi == 0) {
12614 unsigned Shift = Mask[ZeroLo] % NumElts;
12615 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12616 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12617 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12618 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12619 } else if (!Subtarget.hasSSSE3()) {
12620     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12621 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12622 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12623 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12624 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12625 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12626 Shift += Mask[ZeroLo] % NumElts;
12627 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12628 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12629 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12630 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12631 } else
12632 return SDValue();
12633
12634 return DAG.getBitcast(VT, Res);
12635}
12636
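// --- Added illustration (not part of X86ISelLowering.cpp) ---
// The two whole-vector byte shifts used above, as plain array operations.
// VSHLDQ moves bytes toward higher indices and fills the bottom with zeros;
// VSRLDQ moves them toward lower indices and fills the top with zeros, so a
// pair of them can zero either end of a vector, as in the comments above
// (e.g. 01234567 -> zzzzzz01 -> 1zzzzzzz).
#include <array>
#include <cstdint>
using Bytes16 = std::array<uint8_t, 16>;
static Bytes16 shlDQ(const Bytes16 &V, unsigned N) { // like X86ISD::VSHLDQ
  Bytes16 R{};
  for (unsigned i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}
static Bytes16 srlDQ(const Bytes16 &V, unsigned N) { // like X86ISD::VSRLDQ
  Bytes16 R{};
  for (unsigned i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}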
12637/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12638///
12639/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12640/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12641/// matches elements from one of the input vectors shuffled to the left or
12642/// right with zeroable elements 'shifted in'. It handles both the strictly
12643/// bit-wise element shifts and the byte shift across an entire 128-bit double
12644/// quad word lane.
12645///
12646 /// PSLL : (little-endian) left bit shift.
12647/// [ zz, 0, zz, 2 ]
12648/// [ -1, 4, zz, -1 ]
12649/// PSRL : (little-endian) right bit shift.
12650/// [ 1, zz, 3, zz]
12651/// [ -1, -1, 7, zz]
12652/// PSLLDQ : (little-endian) left byte shift
12653/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12654/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12655/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12656/// PSRLDQ : (little-endian) right byte shift
12657/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12658/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12659/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12660static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12661 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12662 int MaskOffset, const APInt &Zeroable,
12663 const X86Subtarget &Subtarget) {
12664 int Size = Mask.size();
12665 unsigned SizeInBits = Size * ScalarSizeInBits;
12666
12667 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12668 for (int i = 0; i < Size; i += Scale)
12669 for (int j = 0; j < Shift; ++j)
12670 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12671 return false;
12672
12673 return true;
12674 };
12675
12676 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12677 for (int i = 0; i != Size; i += Scale) {
12678 unsigned Pos = Left ? i + Shift : i;
12679 unsigned Low = Left ? i : i + Shift;
12680 unsigned Len = Scale - Shift;
12681 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12682 return -1;
12683 }
12684
12685 int ShiftEltBits = ScalarSizeInBits * Scale;
12686 bool ByteShift = ShiftEltBits > 64;
12687 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12688 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12689 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12690
12691 // Normalize the scale for byte shifts to still produce an i64 element
12692 // type.
12693 Scale = ByteShift ? Scale / 2 : Scale;
12694
12695 // We need to round trip through the appropriate type for the shift.
12696 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12697 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12698 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12699 return (int)ShiftAmt;
12700 };
12701
12702 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12703 // keep doubling the size of the integer elements up to that. We can
12704 // then shift the elements of the integer vector by whole multiples of
12705 // their width within the elements of the larger integer vector. Test each
12706 // multiple to see if we can find a match with the moved element indices
12707 // and that the shifted in elements are all zeroable.
12708 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12709 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12710 for (int Shift = 1; Shift != Scale; ++Shift)
12711 for (bool Left : {true, false})
12712 if (CheckZeros(Shift, Scale, Left)) {
12713 int ShiftAmt = MatchShift(Shift, Scale, Left);
12714 if (0 < ShiftAmt)
12715 return ShiftAmt;
12716 }
12717
12718 // no match
12719 return -1;
12720}
12721
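// --- Added illustration (not part of X86ISelLowering.cpp) ---
// A standalone version of the per-candidate test done by CheckZeros and
// MatchShift above, for the first input only (MaskOffset of 0), on a plain
// mask (-1 means undef) plus a Zeroable flag per element. For example,
// Mask = {-1, 0, -1, 2} with Zeroable = {1, 0, 1, 0}, Shift = 1, Scale = 2,
// Left = true matches: each i64 lane of a v4i32 is shifted left by one
// 32-bit element (the PSLL [ zz, 0, zz, 2 ] example above).
#include <vector>
static bool matchesShiftCandidate(const std::vector<int> &Mask,
                                  const std::vector<bool> &Zeroable,
                                  int Shift, int Scale, bool Left) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; i += Scale) {
    // The shifted-in positions of each Scale-wide group must be zeroable.
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
        return false;
    // The surviving positions must hold the expected sequential elements.
    int Pos = Left ? i + Shift : i;
    int Low = Left ? i : i + Shift;
    for (int k = 0; k < Scale - Shift; ++k)
      if (Mask[Pos + k] >= 0 && Mask[Pos + k] != Low + k)
        return false;
  }
  return true;
}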
12722static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12723 SDValue V2, ArrayRef<int> Mask,
12724 const APInt &Zeroable,
12725 const X86Subtarget &Subtarget,
12726 SelectionDAG &DAG) {
12727 int Size = Mask.size();
12728   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12729
12730 MVT ShiftVT;
12731 SDValue V = V1;
12732 unsigned Opcode;
12733
12734 // Try to match shuffle against V1 shift.
12735 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12736 Mask, 0, Zeroable, Subtarget);
12737
12738 // If V1 failed, try to match shuffle against V2 shift.
12739 if (ShiftAmt < 0) {
12740 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12741 Mask, Size, Zeroable, Subtarget);
12742 V = V2;
12743 }
12744
12745 if (ShiftAmt < 0)
12746 return SDValue();
12747
12748   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12749          "Illegal integer vector type");
12750 V = DAG.getBitcast(ShiftVT, V);
12751 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12752 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12753 return DAG.getBitcast(VT, V);
12754}
12755
12756// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12757// Remainder of lower half result is zero and upper half is all undef.
12758static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12759 ArrayRef<int> Mask, uint64_t &BitLen,
12760 uint64_t &BitIdx, const APInt &Zeroable) {
12761 int Size = Mask.size();
12762 int HalfSize = Size / 2;
12763   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12764   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12765
12766 // Upper half must be undefined.
12767 if (!isUndefUpperHalf(Mask))
12768 return false;
12769
12770 // Determine the extraction length from the part of the
12771 // lower half that isn't zeroable.
12772 int Len = HalfSize;
12773 for (; Len > 0; --Len)
12774 if (!Zeroable[Len - 1])
12775 break;
12776   assert(Len > 0 && "Zeroable shuffle mask");
12777
12778 // Attempt to match first Len sequential elements from the lower half.
12779 SDValue Src;
12780 int Idx = -1;
12781 for (int i = 0; i != Len; ++i) {
12782 int M = Mask[i];
12783 if (M == SM_SentinelUndef)
12784 continue;
12785 SDValue &V = (M < Size ? V1 : V2);
12786 M = M % Size;
12787
12788 // The extracted elements must start at a valid index and all mask
12789 // elements must be in the lower half.
12790 if (i > M || M >= HalfSize)
12791 return false;
12792
12793 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12794 Src = V;
12795 Idx = M - i;
12796 continue;
12797 }
12798 return false;
12799 }
12800
12801 if (!Src || Idx < 0)
12802 return false;
12803
12804   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12805 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12806 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12807 V1 = Src;
12808 return true;
12809}
12810
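// --- Added illustration (not part of X86ISelLowering.cpp) ---
// How the BitLen/BitIdx immediates computed above behave: EXTRQ extracts
// BitLen bits starting at bit BitIdx from the low 64 bits of the source.
// Both fields are taken modulo 64 (hence the "& 0x3f"), and a length field of
// zero is assumed here to mean a full 64-bit extraction, which is how a
// whole-half extraction ends up being encoded.
#include <cstdint>
static uint64_t extrqLow64(uint64_t Src, unsigned BitLen, unsigned BitIdx) {
  BitLen &= 0x3f;
  BitIdx &= 0x3f;
  uint64_t FieldMask = BitLen ? ((1ULL << BitLen) - 1) : ~0ULL;
  return (Src >> BitIdx) & FieldMask;
}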
12811// INSERTQ: Extract lowest Len elements from lower half of second source and
12812// insert over first source, starting at Idx.
12813// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12814static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12815 ArrayRef<int> Mask, uint64_t &BitLen,
12816 uint64_t &BitIdx) {
12817 int Size = Mask.size();
12818 int HalfSize = Size / 2;
12819   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12820
12821 // Upper half must be undefined.
12822 if (!isUndefUpperHalf(Mask))
12823 return false;
12824
12825 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12826 SDValue Base;
12827
12828 // Attempt to match first source from mask before insertion point.
12829 if (isUndefInRange(Mask, 0, Idx)) {
12830 /* EMPTY */
12831 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12832 Base = V1;
12833 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12834 Base = V2;
12835 } else {
12836 continue;
12837 }
12838
12839 // Extend the extraction length looking to match both the insertion of
12840 // the second source and the remaining elements of the first.
12841 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12842 SDValue Insert;
12843 int Len = Hi - Idx;
12844
12845 // Match insertion.
12846 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12847 Insert = V1;
12848 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12849 Insert = V2;
12850 } else {
12851 continue;
12852 }
12853
12854 // Match the remaining elements of the lower half.
12855 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12856 /* EMPTY */
12857 } else if ((!Base || (Base == V1)) &&
12858 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12859 Base = V1;
12860 } else if ((!Base || (Base == V2)) &&
12861 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12862 Size + Hi)) {
12863 Base = V2;
12864 } else {
12865 continue;
12866 }
12867
12868 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12869 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12870 V1 = Base;
12871 V2 = Insert;
12872 return true;
12873 }
12874 }
12875
12876 return false;
12877}
12878
12879/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12880static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12881 SDValue V2, ArrayRef<int> Mask,
12882 const APInt &Zeroable, SelectionDAG &DAG) {
12883 uint64_t BitLen, BitIdx;
12884 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12885 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12886 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12887 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12888
12889 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12890 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12891 V2 ? V2 : DAG.getUNDEF(VT),
12892 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12893 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12894
12895 return SDValue();
12896}
12897
12898/// Lower a vector shuffle as a zero or any extension.
12899///
12900/// Given a specific number of elements, element bit width, and extension
12901/// stride, produce either a zero or any extension based on the available
12902/// features of the subtarget. The extended elements are consecutive and
12903/// begin and can start from an offsetted element index in the input; to
12904/// avoid excess shuffling the offset must either being in the bottom lane
12905/// or at the start of a higher lane. All extended elements must be from
12906/// the same lane.
12907static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12908 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12909 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12910   assert(Scale > 1 && "Need a scale to extend.");
12911 int EltBits = VT.getScalarSizeInBits();
12912 int NumElements = VT.getVectorNumElements();
12913 int NumEltsPerLane = 128 / EltBits;
12914 int OffsetLane = Offset / NumEltsPerLane;
12915   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12916          "Only 8, 16, and 32 bit elements can be extended.");
12917   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12918   assert(0 <= Offset && "Extension offset must be positive.");
12919   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12920          "Extension offset must be in the first lane or start an upper lane.");
12921
12922 // Check that an index is in same lane as the base offset.
12923 auto SafeOffset = [&](int Idx) {
12924 return OffsetLane == (Idx / NumEltsPerLane);
12925 };
12926
12927 // Shift along an input so that the offset base moves to the first element.
12928 auto ShuffleOffset = [&](SDValue V) {
12929 if (!Offset)
12930 return V;
12931
12932 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12933 for (int i = 0; i * Scale < NumElements; ++i) {
12934 int SrcIdx = i + Offset;
12935 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12936 }
12937 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12938 };
12939
12940 // Found a valid a/zext mask! Try various lowering strategies based on the
12941 // input type and available ISA extensions.
12942 if (Subtarget.hasSSE41()) {
12943 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12944 // PUNPCK will catch this in a later shuffle match.
12945 if (Offset && Scale == 2 && VT.is128BitVector())
12946 return SDValue();
12947 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12948 NumElements / Scale);
12949 InputV = ShuffleOffset(InputV);
12950 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
12951 DL, ExtVT, InputV, DAG);
12952 return DAG.getBitcast(VT, InputV);
12953 }
12954
12955   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12956
12957 // For any extends we can cheat for larger element sizes and use shuffle
12958 // instructions that can fold with a load and/or copy.
12959 if (AnyExt && EltBits == 32) {
12960 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12961 -1};
12962 return DAG.getBitcast(
12963 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12964 DAG.getBitcast(MVT::v4i32, InputV),
12965 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12966 }
12967 if (AnyExt && EltBits == 16 && Scale > 2) {
12968 int PSHUFDMask[4] = {Offset / 2, -1,
12969 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12970 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12971 DAG.getBitcast(MVT::v4i32, InputV),
12972 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12973 int PSHUFWMask[4] = {1, -1, -1, -1};
12974 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12975 return DAG.getBitcast(
12976 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12977 DAG.getBitcast(MVT::v8i16, InputV),
12978 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12979 }
12980
12981 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12982 // to 64-bits.
12983 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12984     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12985     assert(VT.is128BitVector() && "Unexpected vector width!");
12986
12987 int LoIdx = Offset * EltBits;
12988 SDValue Lo = DAG.getBitcast(
12989 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12990 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12991 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12992
12993 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12994 return DAG.getBitcast(VT, Lo);
12995
12996 int HiIdx = (Offset + 1) * EltBits;
12997 SDValue Hi = DAG.getBitcast(
12998 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12999 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13000 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13001 return DAG.getBitcast(VT,
13002 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13003 }
13004
13005 // If this would require more than 2 unpack instructions to expand, use
13006 // pshufb when available. We can only use more than 2 unpack instructions
13007 // when zero extending i8 elements which also makes it easier to use pshufb.
13008 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13009     assert(NumElements == 16 && "Unexpected byte vector width!");
13010 SDValue PSHUFBMask[16];
13011 for (int i = 0; i < 16; ++i) {
13012 int Idx = Offset + (i / Scale);
13013 if ((i % Scale == 0 && SafeOffset(Idx))) {
13014 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13015 continue;
13016 }
13017 PSHUFBMask[i] =
13018 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13019 }
13020 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13021 return DAG.getBitcast(
13022 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13023 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13024 }
13025
13026 // If we are extending from an offset, ensure we start on a boundary that
13027 // we can unpack from.
13028 int AlignToUnpack = Offset % (NumElements / Scale);
13029 if (AlignToUnpack) {
13030 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13031 for (int i = AlignToUnpack; i < NumElements; ++i)
13032 ShMask[i - AlignToUnpack] = i;
13033 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13034 Offset -= AlignToUnpack;
13035 }
13036
13037 // Otherwise emit a sequence of unpacks.
13038 do {
13039 unsigned UnpackLoHi = X86ISD::UNPCKL;
13040 if (Offset >= (NumElements / 2)) {
13041 UnpackLoHi = X86ISD::UNPCKH;
13042 Offset -= (NumElements / 2);
13043 }
13044
13045 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13046 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13047 : getZeroVector(InputVT, Subtarget, DAG, DL);
13048 InputV = DAG.getBitcast(InputVT, InputV);
13049 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13050 Scale /= 2;
13051 EltBits *= 2;
13052 NumElements /= 2;
13053 } while (Scale > 1);
13054 return DAG.getBitcast(VT, InputV);
13055}
13056
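// --- Added illustration (not part of X86ISelLowering.cpp) ---
// One step of the final unpack loop above, sketched on plain arrays: an
// UNPCKL of a byte vector with an all-zero vector interleaves each of the low
// eight bytes with a zero byte, i.e. it zero-extends them to 16-bit elements.
// The do/while loop repeats this widening (halving Scale each time) until the
// requested extension scale is reached.
#include <array>
#include <cstdint>
static std::array<uint16_t, 8> zextLowBytesOneStep(
    const std::array<uint8_t, 16> &V) {
  std::array<uint16_t, 8> R{};
  for (unsigned i = 0; i != 8; ++i)
    R[i] = V[i]; // low byte = V[i], high byte = the interleaved zero
  return R;
}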
13057/// Try to lower a vector shuffle as a zero extension on any microarch.
13058///
13059/// This routine will try to do everything in its power to cleverly lower
13060/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13061/// check for the profitability of this lowering, it tries to aggressively
13062/// match this pattern. It will use all of the micro-architectural details it
13063/// can to emit an efficient lowering. It handles both blends with all-zero
13064/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13065/// masking out later).
13066///
13067/// The reason we have dedicated lowering for zext-style shuffles is that they
13068/// are both incredibly common and often quite performance sensitive.
13069static SDValue lowerShuffleAsZeroOrAnyExtend(
13070 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13071 const APInt &Zeroable, const X86Subtarget &Subtarget,
13072 SelectionDAG &DAG) {
13073 int Bits = VT.getSizeInBits();
13074 int NumLanes = Bits / 128;
13075 int NumElements = VT.getVectorNumElements();
13076 int NumEltsPerLane = NumElements / NumLanes;
13077   assert(VT.getScalarSizeInBits() <= 32 &&
13078          "Exceeds 32-bit integer zero extension limit");
13079   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13080
13081 // Define a helper function to check a particular ext-scale and lower to it if
13082 // valid.
13083 auto Lower = [&](int Scale) -> SDValue {
13084 SDValue InputV;
13085 bool AnyExt = true;
13086 int Offset = 0;
13087 int Matches = 0;
13088 for (int i = 0; i < NumElements; ++i) {
13089 int M = Mask[i];
13090 if (M < 0)
13091 continue; // Valid anywhere but doesn't tell us anything.
13092 if (i % Scale != 0) {
13093         // Each of the extended elements needs to be zeroable.
13094 if (!Zeroable[i])
13095 return SDValue();
13096
13097 // We no longer are in the anyext case.
13098 AnyExt = false;
13099 continue;
13100 }
13101
13102 // Each of the base elements needs to be consecutive indices into the
13103 // same input vector.
13104 SDValue V = M < NumElements ? V1 : V2;
13105 M = M % NumElements;
13106 if (!InputV) {
13107 InputV = V;
13108 Offset = M - (i / Scale);
13109 } else if (InputV != V)
13110 return SDValue(); // Flip-flopping inputs.
13111
13112 // Offset must start in the lowest 128-bit lane or at the start of an
13113 // upper lane.
13114 // FIXME: Is it ever worth allowing a negative base offset?
13115 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13116 (Offset % NumEltsPerLane) == 0))
13117 return SDValue();
13118
13119 // If we are offsetting, all referenced entries must come from the same
13120 // lane.
13121 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13122 return SDValue();
13123
13124 if ((M % NumElements) != (Offset + (i / Scale)))
13125 return SDValue(); // Non-consecutive strided elements.
13126 Matches++;
13127 }
13128
13129 // If we fail to find an input, we have a zero-shuffle which should always
13130 // have already been handled.
13131 // FIXME: Maybe handle this here in case during blending we end up with one?
13132 if (!InputV)
13133 return SDValue();
13134
13135 // If we are offsetting, don't extend if we only match a single input, we
13136 // can always do better by using a basic PSHUF or PUNPCK.
13137 if (Offset != 0 && Matches < 2)
13138 return SDValue();
13139
13140 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13141 InputV, Mask, Subtarget, DAG);
13142 };
13143
13144 // The widest scale possible for extending is to a 64-bit integer.
13145   assert(Bits % 64 == 0 &&
13146          "The number of bits in a vector must be divisible by 64 on x86!");
13147 int NumExtElements = Bits / 64;
13148
13149 // Each iteration, try extending the elements half as much, but into twice as
13150 // many elements.
13151 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13152     assert(NumElements % NumExtElements == 0 &&
13153            "The input vector size must be divisible by the extended size.");
13154 if (SDValue V = Lower(NumElements / NumExtElements))
13155 return V;
13156 }
13157
13158 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13159 if (Bits != 128)
13160 return SDValue();
13161
13162 // Returns one of the source operands if the shuffle can be reduced to a
13163 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13164 auto CanZExtLowHalf = [&]() {
13165 for (int i = NumElements / 2; i != NumElements; ++i)
13166 if (!Zeroable[i])
13167 return SDValue();
13168 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13169 return V1;
13170 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13171 return V2;
13172 return SDValue();
13173 };
13174
13175 if (SDValue V = CanZExtLowHalf()) {
13176 V = DAG.getBitcast(MVT::v2i64, V);
13177 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13178 return DAG.getBitcast(VT, V);
13179 }
13180
13181 // No viable ext lowering found.
13182 return SDValue();
13183}
13184
13185/// Try to get a scalar value for a specific element of a vector.
13186///
13187/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13188static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13189 SelectionDAG &DAG) {
13190 MVT VT = V.getSimpleValueType();
13191 MVT EltVT = VT.getVectorElementType();
13192 V = peekThroughBitcasts(V);
13193
13194 // If the bitcasts shift the element size, we can't extract an equivalent
13195 // element from it.
13196 MVT NewVT = V.getSimpleValueType();
13197 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13198 return SDValue();
13199
13200 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13201 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13202 // Ensure the scalar operand is the same size as the destination.
13203 // FIXME: Add support for scalar truncation where possible.
13204 SDValue S = V.getOperand(Idx);
13205 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13206 return DAG.getBitcast(EltVT, S);
13207 }
13208
13209 return SDValue();
13210}
13211
13212/// Helper to test for a load that can be folded with x86 shuffles.
13213///
13214/// This is particularly important because the set of instructions varies
13215/// significantly based on whether the operand is a load or not.
13216static bool isShuffleFoldableLoad(SDValue V) {
13217 V = peekThroughBitcasts(V);
13218 return ISD::isNON_EXTLoad(V.getNode());
13219}
13220
13221/// Try to lower insertion of a single element into a zero vector.
13222///
13223 /// This is a common pattern for which we have especially efficient lowerings
13224 /// across all subtarget feature sets.
13225static SDValue lowerShuffleAsElementInsertion(
13226 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13227 const APInt &Zeroable, const X86Subtarget &Subtarget,
13228 SelectionDAG &DAG) {
13229 MVT ExtVT = VT;
13230 MVT EltVT = VT.getVectorElementType();
13231
13232 int V2Index =
13233 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13234 Mask.begin();
13235 bool IsV1Zeroable = true;
13236 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13237 if (i != V2Index && !Zeroable[i]) {
13238 IsV1Zeroable = false;
13239 break;
13240 }
13241
13242 // Check for a single input from a SCALAR_TO_VECTOR node.
13243 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13244 // all the smarts here sunk into that routine. However, the current
13245 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13246 // vector shuffle lowering is dead.
13247 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13248 DAG);
13249 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13250 // We need to zext the scalar if it is smaller than an i32.
13251 V2S = DAG.getBitcast(EltVT, V2S);
13252 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13253 // Using zext to expand a narrow element won't work for non-zero
13254 // insertions.
13255 if (!IsV1Zeroable)
13256 return SDValue();
13257
13258 // Zero-extend directly to i32.
13259 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13260 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13261 }
13262 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13263 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13264 EltVT == MVT::i16) {
13265 // Either not inserting from the low element of the input or the input
13266 // element size is too small to use VZEXT_MOVL to clear the high bits.
13267 return SDValue();
13268 }
13269
13270 if (!IsV1Zeroable) {
13271 // If V1 can't be treated as a zero vector we have fewer options to lower
13272 // this. We can't support integer vectors or non-zero targets cheaply, and
13273 // the V1 elements can't be permuted in any way.
13274     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13275 if (!VT.isFloatingPoint() || V2Index != 0)
13276 return SDValue();
13277 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13278 V1Mask[V2Index] = -1;
13279 if (!isNoopShuffleMask(V1Mask))
13280 return SDValue();
13281 if (!VT.is128BitVector())
13282 return SDValue();
13283
13284 // Otherwise, use MOVSD or MOVSS.
13285     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13286            "Only two types of floating point element types to handle!");
13287 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13288 ExtVT, V1, V2);
13289 }
13290
13291 // This lowering only works for the low element with floating point vectors.
13292 if (VT.isFloatingPoint() && V2Index != 0)
13293 return SDValue();
13294
13295 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13296 if (ExtVT != VT)
13297 V2 = DAG.getBitcast(VT, V2);
13298
13299 if (V2Index != 0) {
13300 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13301 // the desired position. Otherwise it is more efficient to do a vector
13302 // shift left. We know that we can do a vector shift left because all
13303 // the inputs are zero.
13304 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13305 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13306 V2Shuffle[V2Index] = 0;
13307 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13308 } else {
13309 V2 = DAG.getBitcast(MVT::v16i8, V2);
13310 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13311 DAG.getTargetConstant(
13312 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13313 V2 = DAG.getBitcast(VT, V2);
13314 }
13315 }
13316 return V2;
13317}
13318
13319/// Try to lower broadcast of a single - truncated - integer element,
13320/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13321///
13322/// This assumes we have AVX2.
13323static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13324 int BroadcastIdx,
13325 const X86Subtarget &Subtarget,
13326 SelectionDAG &DAG) {
13327   assert(Subtarget.hasAVX2() &&
13328          "We can only lower integer broadcasts with AVX2!");
13329
13330 MVT EltVT = VT.getVectorElementType();
13331 MVT V0VT = V0.getSimpleValueType();
13332
13333   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13334   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13335
13336 MVT V0EltVT = V0VT.getVectorElementType();
13337 if (!V0EltVT.isInteger())
13338 return SDValue();
13339
13340 const unsigned EltSize = EltVT.getSizeInBits();
13341 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13342
13343 // This is only a truncation if the original element type is larger.
13344 if (V0EltSize <= EltSize)
13345 return SDValue();
13346
13347   assert(((V0EltSize % EltSize) == 0) &&
13348          "Scalar type sizes must all be powers of 2 on x86!");
13349
13350 const unsigned V0Opc = V0.getOpcode();
13351 const unsigned Scale = V0EltSize / EltSize;
13352 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13353
13354 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13355 V0Opc != ISD::BUILD_VECTOR)
13356 return SDValue();
13357
13358 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13359
13360 // If we're extracting non-least-significant bits, shift so we can truncate.
13361 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13362 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13363 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13364 if (const int OffsetIdx = BroadcastIdx % Scale)
13365 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13366 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13367
13368 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13369 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13370}
13371
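// --- Added illustration (not part of X86ISelLowering.cpp) ---
// The scalar arithmetic behind the srl + truncate above, shown for the
// hypothetical case of broadcasting the i16 at index BroadcastIdx out of a
// build_vector of i64 elements: shift the wide element right by
// OffsetIdx * EltSize bits, then truncate to the narrow type.
#include <cstdint>
static uint16_t truncBroadcastElt(uint64_t WideElt, unsigned BroadcastIdx) {
  const unsigned Scale = 64 / 16;            // V0EltSize / EltSize
  unsigned OffsetIdx = BroadcastIdx % Scale; // which i16 inside the i64
  return (uint16_t)(WideElt >> (OffsetIdx * 16));
}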
13372/// Test whether this can be lowered with a single SHUFPS instruction.
13373///
13374/// This is used to disable more specialized lowerings when the shufps lowering
13375/// will happen to be efficient.
13376static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13377 // This routine only handles 128-bit shufps.
13378   assert(Mask.size() == 4 && "Unsupported mask size!");
13379   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13380   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13381   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13382   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13383
13384 // To lower with a single SHUFPS we need to have the low half and high half
13385 // each requiring a single input.
13386 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13387 return false;
13388 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13389 return false;
13390
13391 return true;
13392}
13393
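A SHUFPS fills its low two result lanes from the first operand and its high two from the second, which is exactly what isSingleSHUFPSMask checks. This standalone C++ sketch (not part of the file; fitsSingleShufps is an invented name) restates the test and shows one passing and one failing mask.

#include <array>
#include <cstdio>

static bool fitsSingleShufps(const std::array<int, 4> &M) {
  auto fromV1 = [](int E) { return E >= 0 && E < 4; };
  auto fromV2 = [](int E) { return E >= 4; };
  // Each half of the result may reference only one input (-1 = don't care).
  bool LoMixed = (fromV1(M[0]) && fromV2(M[1])) || (fromV2(M[0]) && fromV1(M[1]));
  bool HiMixed = (fromV1(M[2]) && fromV2(M[3])) || (fromV2(M[2]) && fromV1(M[3]));
  return !LoMixed && !HiMixed;
}

int main() {
  std::printf("%d\n", fitsSingleShufps({0, 1, 4, 5})); // 1: lo from V1, hi from V2
  std::printf("%d\n", fitsSingleShufps({0, 4, 1, 5})); // 0: both halves mix inputs
}
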
13394/// If we are extracting two 128-bit halves of a vector and shuffling the
13395/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13396/// multi-shuffle lowering.
13397static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13398 SDValue N1, ArrayRef<int> Mask,
13399 SelectionDAG &DAG) {
13400 MVT VT = N0.getSimpleValueType();
13401 assert((VT.is128BitVector() &&
13402 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13403 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13404
13405 // Check that both sources are extracts of the same source vector.
13406 if (!N0.hasOneUse() || !N1.hasOneUse() ||
13407 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13408 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13409 N0.getOperand(0) != N1.getOperand(0))
13410 return SDValue();
13411
13412 SDValue WideVec = N0.getOperand(0);
13413 MVT WideVT = WideVec.getSimpleValueType();
13414 if (!WideVT.is256BitVector())
13415 return SDValue();
13416
13417 // Match extracts of each half of the wide source vector. Commute the shuffle
13418 // if the extract of the low half is N1.
13419 unsigned NumElts = VT.getVectorNumElements();
13420 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13421 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13422 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13423 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13424 ShuffleVectorSDNode::commuteMask(NewMask);
13425 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13426 return SDValue();
13427
13428 // Final bailout: if the mask is simple, we are better off using an extract
13429 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13430 // because that avoids a constant load from memory.
13431 if (NumElts == 4 &&
13432 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13433 return SDValue();
13434
13435 // Extend the shuffle mask with undef elements.
13436 NewMask.append(NumElts, -1);
13437
13438 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13439 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13440 NewMask);
13441 // This is free: ymm -> xmm.
13442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13443 DAG.getIntPtrConstant(0, DL));
13444}
13445
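The transform above relies on the fact that a mask over (extract(X, 0), extract(X, NumElts)) already uses X's own lane numbering, so the 256-bit shuffle only needs the original mask padded with undefs. A standalone C++ sketch of that mask widening (widenMaskForVperm is an invented name, not an LLVM API):

#include <cstdio>
#include <vector>

static std::vector<int> widenMaskForVperm(const std::vector<int> &Mask) {
  std::vector<int> Wide(Mask);      // lane numbers carry over unchanged
  Wide.resize(Mask.size() * 2, -1); // upper result lanes are don't-care
  return Wide;
}

int main() {
  // shuffle (extract v8f32 X, 0), (extract X, 4), <1,5,2,6>
  //   --> extract (vpermps X, <1,5,2,6,u,u,u,u>), 0
  for (int M : widenMaskForVperm({1, 5, 2, 6}))
    std::printf("%d ", M);
  std::printf("\n");
}
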
13446/// Try to lower broadcast of a single element.
13447///
13448/// For convenience, this code also bundles all of the subtarget feature set
13449/// filtering. While a little annoying to re-dispatch on type here, there isn't
13450/// a convenient way to factor it out.
13451static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13452 SDValue V2, ArrayRef<int> Mask,
13453 const X86Subtarget &Subtarget,
13454 SelectionDAG &DAG) {
13455 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13456 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13457 (Subtarget.hasAVX2() && VT.isInteger())))
13458 return SDValue();
13459
13460 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13461 // we can only broadcast from a register with AVX2.
13462 unsigned NumEltBits = VT.getScalarSizeInBits();
13463 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13464 ? X86ISD::MOVDDUP
13465 : X86ISD::VBROADCAST;
13466 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13467
13468 // Check that the mask is a broadcast.
13469 int BroadcastIdx = getSplatIndex(Mask);
13470 if (BroadcastIdx < 0)
13471 return SDValue();
13472 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13473 "a sorted mask where the broadcast "
13474 "comes from V1.");
13475
13476 // Go up the chain of (vector) values to find a scalar load that we can
13477 // combine with the broadcast.
13478 // TODO: Combine this logic with findEltLoadSrc() used by
13479 // EltsFromConsecutiveLoads().
13480 int BitOffset = BroadcastIdx * NumEltBits;
13481 SDValue V = V1;
13482 for (;;) {
13483 switch (V.getOpcode()) {
13484 case ISD::BITCAST: {
13485 V = V.getOperand(0);
13486 continue;
13487 }
13488 case ISD::CONCAT_VECTORS: {
13489 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13490 int OpIdx = BitOffset / OpBitWidth;
13491 V = V.getOperand(OpIdx);
13492 BitOffset %= OpBitWidth;
13493 continue;
13494 }
13495 case ISD::EXTRACT_SUBVECTOR: {
13496 // The extraction index adds to the existing offset.
13497 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13498 unsigned Idx = V.getConstantOperandVal(1);
13499 unsigned BeginOffset = Idx * EltBitWidth;
13500 BitOffset += BeginOffset;
13501 V = V.getOperand(0);
13502 continue;
13503 }
13504 case ISD::INSERT_SUBVECTOR: {
13505 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13506 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13507 int Idx = (int)V.getConstantOperandVal(2);
13508 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13509 int BeginOffset = Idx * EltBitWidth;
13510 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13511 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13512 BitOffset -= BeginOffset;
13513 V = VInner;
13514 } else {
13515 V = VOuter;
13516 }
13517 continue;
13518 }
13519 }
13520 break;
13521 }
13522 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13523 BroadcastIdx = BitOffset / NumEltBits;
13524
13525 // Do we need to bitcast the source to retrieve the original broadcast index?
13526 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13527
13528 // Check if this is a broadcast of a scalar. We special case lowering
13529 // for scalars so that we can more effectively fold with loads.
13530 // If the original value has a larger element type than the shuffle, the
13531 // broadcast element is in essence truncated. Make that explicit to ease
13532 // folding.
13533 if (BitCastSrc && VT.isInteger())
13534 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13535 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13536 return TruncBroadcast;
13537
13538 // Also check the simpler case, where we can directly reuse the scalar.
13539 if (!BitCastSrc &&
13540 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13541 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13542 V = V.getOperand(BroadcastIdx);
13543
13544 // If we can't broadcast from a register, check that the input is a load.
13545 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13546 return SDValue();
13547 } else if (ISD::isNormalLoad(V.getNode()) &&
13548 cast<LoadSDNode>(V)->isSimple()) {
13549 // We do not check for one-use of the vector load because a broadcast load
13550 // is expected to be a win for code size, register pressure, and possibly
13551 // uops even if the original vector load is not eliminated.
13552
13553 // Reduce the vector load and shuffle to a broadcasted scalar load.
13554 LoadSDNode *Ld = cast<LoadSDNode>(V);
13555 SDValue BaseAddr = Ld->getOperand(1);
13556 MVT SVT = VT.getScalarType();
13557 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13558 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13559 SDValue NewAddr =
13560 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13561
13562 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13563 // than MOVDDUP.
13564 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13565 if (Opcode == X86ISD::VBROADCAST) {
13566 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13567 SDValue Ops[] = {Ld->getChain(), NewAddr};
13568 V = DAG.getMemIntrinsicNode(
13569 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13570 DAG.getMachineFunction().getMachineMemOperand(
13571 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13572 DAG.makeEquivalentMemoryOrdering(Ld, V);
13573 return DAG.getBitcast(VT, V);
13574 }
13575 assert(SVT == MVT::f64 && "Unexpected VT!");
13576 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13577 DAG.getMachineFunction().getMachineMemOperand(
13578 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13579 DAG.makeEquivalentMemoryOrdering(Ld, V);
13580 } else if (!BroadcastFromReg) {
13581 // We can't broadcast from a vector register.
13582 return SDValue();
13583 } else if (BitOffset != 0) {
13584 // We can only broadcast from the zero-element of a vector register,
13585 // but it can be advantageous to broadcast from the zero-element of a
13586 // subvector.
13587 if (!VT.is256BitVector() && !VT.is512BitVector())
13588 return SDValue();
13589
13590 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13591 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13592 return SDValue();
13593
13594 // Only broadcast the zero-element of a 128-bit subvector.
13595 if ((BitOffset % 128) != 0)
13596 return SDValue();
13597
13598 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13599 "Unexpected bit-offset");
13600 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13601 "Unexpected vector size");
13602 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13603 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13604 }
13605
13606 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
13607 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
13608 DAG.getBitcast(MVT::f64, V));
13609
13610 // If this is a scalar, do the broadcast on this type and bitcast.
13611 if (!V.getValueType().isVector()) {
13612 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13613 "Unexpected scalar size");
13614 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13615 VT.getVectorNumElements());
13616 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13617 }
13618
13619 // We only support broadcasting from 128-bit vectors to minimize the
13620 // number of patterns we need to deal with in isel. So extract down to
13621 // 128-bits, removing as many bitcasts as possible.
13622 if (V.getValueSizeInBits() > 128)
13623 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13624
13625 // Otherwise cast V to a vector with the same element type as VT, but
13626 // possibly narrower than VT. Then perform the broadcast.
13627 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13628 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13629 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13630}
13631
13632// Check for whether we can use INSERTPS to perform the shuffle. We only use
13633// INSERTPS when the V1 elements are already in the correct locations
13634// because otherwise we can just always use two SHUFPS instructions which
13635// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13636// perform INSERTPS if a single V1 element is out of place and all V2
13637// elements are zeroable.
13638static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13639 unsigned &InsertPSMask,
13640 const APInt &Zeroable,
13641 ArrayRef<int> Mask, SelectionDAG &DAG) {
13642 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13643 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13644 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13645
13646 // Attempt to match INSERTPS with one element from VA or VB being
13647 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13648 // are updated.
13649 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13650 ArrayRef<int> CandidateMask) {
13651 unsigned ZMask = 0;
13652 int VADstIndex = -1;
13653 int VBDstIndex = -1;
13654 bool VAUsedInPlace = false;
13655
13656 for (int i = 0; i < 4; ++i) {
13657 // Synthesize a zero mask from the zeroable elements (includes undefs).
13658 if (Zeroable[i]) {
13659 ZMask |= 1 << i;
13660 continue;
13661 }
13662
13663 // Flag if we use any VA inputs in place.
13664 if (i == CandidateMask[i]) {
13665 VAUsedInPlace = true;
13666 continue;
13667 }
13668
13669 // We can only insert a single non-zeroable element.
13670 if (VADstIndex >= 0 || VBDstIndex >= 0)
13671 return false;
13672
13673 if (CandidateMask[i] < 4) {
13674 // VA input out of place for insertion.
13675 VADstIndex = i;
13676 } else {
13677 // VB input for insertion.
13678 VBDstIndex = i;
13679 }
13680 }
13681
13682 // Don't bother if we have no (non-zeroable) element for insertion.
13683 if (VADstIndex < 0 && VBDstIndex < 0)
13684 return false;
13685
13686 // Determine element insertion src/dst indices. The src index is from the
13687 // start of the inserted vector, not the start of the concatenated vector.
13688 unsigned VBSrcIndex = 0;
13689 if (VADstIndex >= 0) {
13690 // If we have a VA input out of place, we use VA as the V2 element
13691 // insertion and don't use the original V2 at all.
13692 VBSrcIndex = CandidateMask[VADstIndex];
13693 VBDstIndex = VADstIndex;
13694 VB = VA;
13695 } else {
13696 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13697 }
13698
13699 // If no V1 inputs are used in place, then the result is created only from
13700 // the zero mask and the V2 insertion - so remove V1 dependency.
13701 if (!VAUsedInPlace)
13702 VA = DAG.getUNDEF(MVT::v4f32);
13703
13704 // Update V1, V2 and InsertPSMask accordingly.
13705 V1 = VA;
13706 V2 = VB;
13707
13708 // Insert the V2 element into the desired position.
13709 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13710 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13711 return true;
13712 };
13713
13714 if (matchAsInsertPS(V1, V2, Mask))
13715 return true;
13716
13717 // Commute and try again.
13718 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13719 ShuffleVectorSDNode::commuteMask(CommutedMask);
13720 if (matchAsInsertPS(V2, V1, CommutedMask))
13721 return true;
13722
13723 return false;
13724}
13725
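The immediate assembled above ((VBSrcIndex << 6) | (VBDstIndex << 4) | ZMask) follows the INSERTPS encoding: source lane in bits [7:6], destination lane in bits [5:4], zero mask in bits [3:0]. A standalone C++ sketch (makeInsertPSImm is an invented helper, not an LLVM API):

#include <cstdio>

static unsigned makeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                                unsigned ZeroMask) {
  // Bits [7:6] = source lane, [5:4] = destination lane, [3:0] = zeroed lanes.
  return (SrcLane << 6) | (DstLane << 4) | (ZeroMask & 0xF);
}

int main() {
  // Insert lane 2 of the second operand into lane 1 of the first, zero lane 3.
  std::printf("0x%02X\n", makeInsertPSImm(2, 1, 0x8)); // prints 0x98
}
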
13726static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13727 ArrayRef<int> Mask, const APInt &Zeroable,
13728 SelectionDAG &DAG) {
13729 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13730 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13731
13732 // Attempt to match the insertps pattern.
13733 unsigned InsertPSMask;
13734 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13735 return SDValue();
13736
13737 // Insert the V2 element into the desired position.
13738 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13739 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13740}
13741
13742/// Try to lower a shuffle as a permute of the inputs followed by an
13743/// UNPCK instruction.
13744///
13745/// This specifically targets cases where we end up alternating between
13746/// the two inputs, and so can permute them into something that feeds a single
13747/// UNPCK instruction. Note that this routine only targets integer vectors
13748/// because for floating point vectors we have a generalized SHUFPS lowering
13749/// strategy that handles everything that doesn't *exactly* match an unpack,
13750/// making this clever lowering unnecessary.
13751static SDValue lowerShuffleAsPermuteAndUnpack(
13752 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13753 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13754 assert(!VT.isFloatingPoint() &&
13755 "This routine only supports integer vectors.");
13756 assert(VT.is128BitVector() &&
13757 "This routine only works on 128-bit vectors.");
13758 assert(!V2.isUndef() &&
13759 "This routine should only be used when blending two inputs.");
13760 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13761
13762 int Size = Mask.size();
13763
13764 int NumLoInputs =
13765 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13766 int NumHiInputs =
13767 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13768
13769 bool UnpackLo = NumLoInputs >= NumHiInputs;
13770
13771 auto TryUnpack = [&](int ScalarSize, int Scale) {
13772 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13773 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13774
13775 for (int i = 0; i < Size; ++i) {
13776 if (Mask[i] < 0)
13777 continue;
13778
13779 // Each element of the unpack contains Scale elements from this mask.
13780 int UnpackIdx = i / Scale;
13781
13782 // We only handle the case where V1 feeds the first slots of the unpack.
13783 // We rely on canonicalization to ensure this is the case.
13784 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13785 return SDValue();
13786
13787 // Setup the mask for this input. The indexing is tricky as we have to
13788 // handle the unpack stride.
13789 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13790 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13791 Mask[i] % Size;
13792 }
13793
13794 // If we will have to shuffle both inputs to use the unpack, check whether
13795 // we can just unpack first and shuffle the result. If so, skip this unpack.
13796 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13797 !isNoopShuffleMask(V2Mask))
13798 return SDValue();
13799
13800 // Shuffle the inputs into place.
13801 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13802 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13803
13804 // Cast the inputs to the type we will use to unpack them.
13805 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13806 V1 = DAG.getBitcast(UnpackVT, V1);
13807 V2 = DAG.getBitcast(UnpackVT, V2);
13808
13809 // Unpack the inputs and cast the result back to the desired type.
13810 return DAG.getBitcast(
13811 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13812 UnpackVT, V1, V2));
13813 };
13814
13815 // We try each unpack from the largest to the smallest to try and find one
13816 // that fits this mask.
13817 int OrigScalarSize = VT.getScalarSizeInBits();
13818 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13819 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13820 return Unpack;
13821
13822 // If we're shuffling with a zero vector then we're better off not doing
13823 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13824 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13825 ISD::isBuildVectorAllZeros(V2.getNode()))
13826 return SDValue();
13827
13828 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13829 // initial unpack.
13830 if (NumLoInputs == 0 || NumHiInputs == 0) {
13831 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13832 "We have to have *some* inputs!");
13833 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13834
13835 // FIXME: We could consider the total complexity of the permute of each
13836 // possible unpacking. Or at the least we should consider how many
13837 // half-crossings are created.
13838 // FIXME: We could consider commuting the unpacks.
13839
13840 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13841 for (int i = 0; i < Size; ++i) {
13842 if (Mask[i] < 0)
13843 continue;
13844
13845 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13846
13847 PermMask[i] =
13848 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13849 }
13850 return DAG.getVectorShuffle(
13851 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
13852 DL, VT, V1, V2),
13853 DAG.getUNDEF(VT), PermMask);
13854 }
13855
13856 return SDValue();
13857}
13858
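For the simplest case handled above (Scale == 1, unpack-low), the idea is: if the result alternates V1 and V2 elements, pre-shuffle each input so its needed elements occupy the low half, then a single UNPCKL interleaves them. A standalone C++ sketch of that planning step (planPermuteAndUnpackLo is an invented name, not an LLVM API):

#include <cstdio>
#include <vector>

static bool planPermuteAndUnpackLo(const std::vector<int> &Mask,
                                   std::vector<int> &V1Mask,
                                   std::vector<int> &V2Mask) {
  int Size = (int)Mask.size();
  V1Mask.assign(Size, -1);
  V2Mask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    bool WantsV1 = Mask[i] < Size;
    if ((i % 2 == 0) != WantsV1) // even unpack slots must come from V1
      return false;
    (WantsV1 ? V1Mask : V2Mask)[i / 2] = Mask[i] % Size;
  }
  return true;
}

int main() {
  // v4i32 mask <3, 7, 1, 5>: permute both inputs with <3, 1, u, u>, then UNPCKL.
  std::vector<int> M1, M2;
  if (planPermuteAndUnpackLo({3, 7, 1, 5}, M1, M2))
    std::printf("V1 mask: %d %d; V2 mask: %d %d\n", M1[0], M1[1], M2[0], M2[1]);
}
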
13859/// Handle lowering of 2-lane 64-bit floating point shuffles.
13860///
13861/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13862/// support for floating point shuffles but not integer shuffles. These
13863/// instructions will incur a domain crossing penalty on some chips though so
13864/// it is better to avoid lowering through this for integer vectors where
13865/// possible.
13866static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13867 const APInt &Zeroable, SDValue V1, SDValue V2,
13868 const X86Subtarget &Subtarget,
13869 SelectionDAG &DAG) {
13870 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13871 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13872 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13873
13874 if (V2.isUndef()) {
13875 // Check for being able to broadcast a single element.
13876 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13877 Mask, Subtarget, DAG))
13878 return Broadcast;
13879
13880 // Straight shuffle of a single input vector. Simulate this by using the
13881 // single input as both of the "inputs" to this instruction.
13882 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13883
13884 if (Subtarget.hasAVX()) {
13885 // If we have AVX, we can use VPERMILPS which will allow folding a load
13886 // into the shuffle.
13887 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13888 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13889 }
13890
13891 return DAG.getNode(
13892 X86ISD::SHUFP, DL, MVT::v2f64,
13893 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13894 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13895 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13896 }
13897 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13898 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13899 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13900 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13901
13902 if (Subtarget.hasAVX2())
13903 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13904 return Extract;
13905
13906 // When loading a scalar and then shuffling it into a vector we can often do
13907 // the insertion cheaply.
13908 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13909 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13910 return Insertion;
13911 // Try inverting the insertion since for v2 masks it is easy to do and we
13912 // can't reliably sort the mask one way or the other.
13913 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13914 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13915 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13916 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13917 return Insertion;
13918
13919 // Try to use one of the special instruction patterns to handle two common
13920 // blend patterns if a zero-blend above didn't work.
13921 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13922 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13923 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13924 // We can either use a special instruction to load over the low double or
13925 // to move just the low double.
13926 return DAG.getNode(
13927 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13928 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13929
13930 if (Subtarget.hasSSE41())
13931 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13932 Zeroable, Subtarget, DAG))
13933 return Blend;
13934
13935 // Use dedicated unpack instructions for masks that match their pattern.
13936 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
13937 return V;
13938
13939 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13940 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13941 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13942}
13943
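The two-bit SHUFPD immediate built at the end of lowerV2F64Shuffle encodes one lane choice per result element: bit 0 picks the lane of the first operand for result lane 0, and bit 1 picks the lane of the second operand for result lane 1. A standalone C++ sketch assuming a canonical two-input mask (Mask[0] in {0,1}, Mask[1] in {2,3}); makeShufpdImm is an invented name:

#include <cstdio>

static unsigned makeShufpdImm(int Mask0, int Mask1) {
  // Bit 0: lane of operand 1 for result lane 0; bit 1: lane of operand 2
  // for result lane 1 (mask entries 2/3 map to lanes 0/1 of operand 2).
  return (unsigned)(Mask0 == 1) | ((unsigned)((Mask1 - 2) == 1) << 1);
}

int main() {
  // shuffle <2 x double> V1, V2, <1, 2>: result = { V1[1], V2[0] }, imm = 1.
  std::printf("imm = %u\n", makeShufpdImm(1, 2));
}
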
13944/// Handle lowering of 2-lane 64-bit integer shuffles.
13945///
13946/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13947/// the integer unit to minimize domain crossing penalties. However, for blends
13948/// it falls back to the floating point shuffle operation with appropriate bit
13949/// casting.
13950static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13951 const APInt &Zeroable, SDValue V1, SDValue V2,
13952 const X86Subtarget &Subtarget,
13953 SelectionDAG &DAG) {
13954 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13955 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13956 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13957
13958 if (V2.isUndef()) {
13959 // Check for being able to broadcast a single element.
13960 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13961 Mask, Subtarget, DAG))
13962 return Broadcast;
13963
13964 // Straight shuffle of a single input vector. For everything from SSE2
13965 // onward this has a single fast instruction with no scary immediates.
13966 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13967 V1 = DAG.getBitcast(MVT::v4i32, V1);
13968 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13969 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13970 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13971 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13972 return DAG.getBitcast(
13973 MVT::v2i64,
13974 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13975 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13976 }
13977 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13978 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13979 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13980 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13981
13982 if (Subtarget.hasAVX2())
13983 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13984 return Extract;
13985
13986 // Try to use shift instructions.
13987 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
13988 Zeroable, Subtarget, DAG))
13989 return Shift;
13990
13991 // When loading a scalar and then shuffling it into a vector we can often do
13992 // the insertion cheaply.
13993 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13994 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13995 return Insertion;
13996 // Try inverting the insertion since for v2 masks it is easy to do and we
13997 // can't reliably sort the mask one way or the other.
13998 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13999 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14000 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14001 return Insertion;
14002
14003 // We have different paths for blend lowering, but they all must use the
14004 // *exact* same predicate.
14005 bool IsBlendSupported = Subtarget.hasSSE41();
14006 if (IsBlendSupported)
14007 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14008 Zeroable, Subtarget, DAG))
14009 return Blend;
14010
14011 // Use dedicated unpack instructions for masks that match their pattern.
14012 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14013 return V;
14014
14015 // Try to use byte rotation instructions.
14016 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14017 if (Subtarget.hasSSSE3()) {
14018 if (Subtarget.hasVLX())
14019 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14020 Subtarget, DAG))
14021 return Rotate;
14022
14023 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14024 Subtarget, DAG))
14025 return Rotate;
14026 }
14027
14028 // If we have direct support for blends, we should lower by decomposing into
14029 // a permute. That will be faster than the domain cross.
14030 if (IsBlendSupported)
14031 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14032 Subtarget, DAG);
14033
14034 // We implement this with SHUFPD which is pretty lame because it will likely
14035 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14036 // However, all the alternatives are still more cycles and newer chips don't
14037 // have this problem. It would be really nice if x86 had better shuffles here.
14038 V1 = DAG.getBitcast(MVT::v2f64, V1);
14039 V2 = DAG.getBitcast(MVT::v2f64, V2);
14040 return DAG.getBitcast(MVT::v2i64,
14041 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14042}
14043
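The single-input v2i64 path above widens each 64-bit lane selection into a pair of 32-bit selections so the shuffle can be issued as a v4i32 PSHUFD. A standalone C++ sketch (widenV2ToV4Mask is an invented name, not an LLVM API):

#include <array>
#include <cstdio>

static std::array<int, 4> widenV2ToV4Mask(int M0, int M1) {
  // Each 64-bit lane M becomes the 32-bit lane pair {2*M, 2*M+1}.
  auto Widen = [](int M, int Half) { return M < 0 ? -1 : M * 2 + Half; };
  return {Widen(M0, 0), Widen(M0, 1), Widen(M1, 0), Widen(M1, 1)};
}

int main() {
  // v2i64 mask <1, 0> (swap the two halves) --> v4i32 mask <2, 3, 0, 1>
  for (int M : widenV2ToV4Mask(1, 0))
    std::printf("%d ", M);
  std::printf("\n");
}
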
14044/// Lower a vector shuffle using the SHUFPS instruction.
14045///
14046/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14047/// It makes no assumptions about whether this is the *best* lowering, it simply
14048/// uses it.
14049static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14050 ArrayRef<int> Mask, SDValue V1,
14051 SDValue V2, SelectionDAG &DAG) {
14052 SDValue LowV = V1, HighV = V2;
14053 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14054 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14055
14056 if (NumV2Elements == 1) {
14057 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14058
14059 // Compute the index adjacent to V2Index and in the same half by toggling
14060 // the low bit.
14061 int V2AdjIndex = V2Index ^ 1;
14062
14063 if (Mask[V2AdjIndex] < 0) {
14064 // Handles all the cases where we have a single V2 element and an undef.
14065 // This will only ever happen in the high lanes because we commute the
14066 // vector otherwise.
14067 if (V2Index < 2)
14068 std::swap(LowV, HighV);
14069 NewMask[V2Index] -= 4;
14070 } else {
14071 // Handle the case where the V2 element ends up adjacent to a V1 element.
14072 // To make this work, blend them together as the first step.
14073 int V1Index = V2AdjIndex;
14074 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14075 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14076 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14077
14078 // Now proceed to reconstruct the final blend as we have the necessary
14079 // high or low half formed.
14080 if (V2Index < 2) {
14081 LowV = V2;
14082 HighV = V1;
14083 } else {
14084 HighV = V2;
14085 }
14086 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14087 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14088 }
14089 } else if (NumV2Elements == 2) {
14090 if (Mask[0] < 4 && Mask[1] < 4) {
14091 // Handle the easy case where we have V1 in the low lanes and V2 in the
14092 // high lanes.
14093 NewMask[2] -= 4;
14094 NewMask[3] -= 4;
14095 } else if (Mask[2] < 4 && Mask[3] < 4) {
14096 // We also handle the reversed case because this utility may get called
14097 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14098 // arrange things in the right direction.
14099 NewMask[0] -= 4;
14100 NewMask[1] -= 4;
14101 HighV = V1;
14102 LowV = V2;
14103 } else {
14104 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14105 // trying to place elements directly, just blend them and set up the final
14106 // shuffle to place them.
14107
14108 // The first two blend mask elements are for V1, the second two are for
14109 // V2.
14110 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14111 Mask[2] < 4 ? Mask[2] : Mask[3],
14112 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14113 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14114 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14115 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14116
14117 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14118 // a blend.
14119 LowV = HighV = V1;
14120 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14121 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14122 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14123 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14124 }
14125 } else if (NumV2Elements == 3) {
14126 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14127 // we can get here via other paths (e.g. repeated mask matching) where we
14128 // don't want to do another round of lowerVECTOR_SHUFFLE.
14129 ShuffleVectorSDNode::commuteMask(NewMask);
14130 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14131 }
14132 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14133 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14134}
14135
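The imm8 consumed by SHUFPS/PSHUFD packs one 2-bit lane selector per result lane, lane 0 in the low bits; for SHUFPS, result lanes 0-1 read the first operand (LowV above) and lanes 2-3 read the second (HighV). The following standalone C++ sketch builds such an immediate from a 4-element mask; makeV4Imm8 is an invented name, and its handling of undef entries (mapped to 0) and of entries >= 4 (reduced modulo 4) is a simplification, not necessarily what getV4X86ShuffleImm8ForMask does.

#include <cstdio>

static unsigned makeV4Imm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int Lane = Mask[i] < 0 ? 0 : Mask[i] % 4; // 2-bit lane selector
    Imm |= (unsigned)Lane << (2 * i);
  }
  return Imm;
}

int main() {
  const int Mask[4] = {2, 0, 7, 5}; // lanes 2,0 of LowV then lanes 3,1 of HighV
  std::printf("0x%02X\n", makeV4Imm8(Mask)); // 2 | 0<<2 | 3<<4 | 1<<6 = 0x72
}
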
14136/// Lower 4-lane 32-bit floating point shuffles.
14137///
14138/// Uses instructions exclusively from the floating point unit to minimize
14139/// domain crossing penalties, as these are sufficient to implement all v4f32
14140/// shuffles.
14141static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14142 const APInt &Zeroable, SDValue V1, SDValue V2,
14143 const X86Subtarget &Subtarget,
14144 SelectionDAG &DAG) {
14145 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14146 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14147 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14148
14149 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14150
14151 if (NumV2Elements == 0) {
14152 // Check for being able to broadcast a single element.
14153 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14154 Mask, Subtarget, DAG))
14155 return Broadcast;
14156
14157 // Use even/odd duplicate instructions for masks that match their pattern.
14158 if (Subtarget.hasSSE3()) {
14159 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14160 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14161 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14162 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14163 }
14164
14165 if (Subtarget.hasAVX()) {
14166 // If we have AVX, we can use VPERMILPS which will allow folding a load
14167 // into the shuffle.
14168 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14169 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14170 }
14171
14172 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14173 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14174 if (!Subtarget.hasSSE2()) {
14175 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14176 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14177 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14178 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14179 }
14180
14181 // Otherwise, use a straight shuffle of a single input vector. We pass the
14182 // input vector to both operands to simulate this with a SHUFPS.
14183 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14184 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14185 }
14186
14187 if (Subtarget.hasAVX2())
14188 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14189 return Extract;
14190
14191 // There are special ways we can lower some single-element blends. However, we
14192 // have custom ways we can lower more complex single-element blends below that
14193 // we defer to if both this and BLENDPS fail to match, so restrict this to
14194 // when the V2 input is targeting element 0 of the mask -- that is the fast
14195 // case here.
14196 if (NumV2Elements == 1 && Mask[0] >= 4)
14197 if (SDValue V = lowerShuffleAsElementInsertion(
14198 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14199 return V;
14200
14201 if (Subtarget.hasSSE41()) {
14202 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14203 Zeroable, Subtarget, DAG))
14204 return Blend;
14205
14206 // Use INSERTPS if we can complete the shuffle efficiently.
14207 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14208 return V;
14209
14210 if (!isSingleSHUFPSMask(Mask))
14211 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14212 V2, Mask, DAG))
14213 return BlendPerm;
14214 }
14215
14216 // Use low/high mov instructions. These are only valid in SSE1 because
14217 // otherwise they are widened to v2f64 and never get here.
14218 if (!Subtarget.hasSSE2()) {
14219 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14220 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14221 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14222 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14223 }
14224
14225 // Use dedicated unpack instructions for masks that match their pattern.
14226 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14227 return V;
14228
14229 // Otherwise fall back to a SHUFPS lowering strategy.
14230 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14231}
14232
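The even/odd duplicate checks above map mask <0,0,2,2> to MOVSLDUP and <1,1,3,3> to MOVSHDUP, with undef entries free to match either; this is roughly what the isShuffleEquivalent calls test against those reference masks, though the real helper also considers the operands. A standalone C++ sketch (matchesDup is an invented name):

#include <array>
#include <cstdio>

static bool matchesDup(const std::array<int, 4> &Mask, bool Odd) {
  for (int i = 0; i < 4; ++i) {
    int Expected = (i & ~1) + (Odd ? 1 : 0); // 0,0,2,2 or 1,1,3,3
    if (Mask[i] >= 0 && Mask[i] != Expected)  // -1 matches anything
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", matchesDup({0, -1, 2, 2}, /*Odd=*/false)); // 1: MOVSLDUP
  std::printf("%d\n", matchesDup({1, 1, 3, 2}, /*Odd=*/true));   // 0: no match
}
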
14233/// Lower 4-lane i32 vector shuffles.
14234///
14235/// We try to handle these with integer-domain shuffles where we can, but for
14236/// blends we use the floating point domain blend instructions.
14237static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14238 const APInt &Zeroable, SDValue V1, SDValue V2,
14239 const X86Subtarget &Subtarget,
14240 SelectionDAG &DAG) {
14241 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14242 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14243 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14244
14245 // Whenever we can lower this as a zext, that instruction is strictly faster
14246 // than any alternative. It also allows us to fold memory operands into the
14247 // shuffle in many cases.
14248 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14249 Zeroable, Subtarget, DAG))
14250 return ZExt;
14251
14252 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14253
14254 if (NumV2Elements == 0) {
14255 // Try to use broadcast unless the mask only has one non-undef element.
14256 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14257 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14258 Mask, Subtarget, DAG))
14259 return Broadcast;
14260 }
14261
14262 // Straight shuffle of a single input vector. For everything from SSE2
14263 // onward this has a single fast instruction with no scary immediates.
14264 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14265 // but we aren't actually going to use the UNPCK instruction because doing
14266 // so prevents folding a load into this instruction or making a copy.
14267 const int UnpackLoMask[] = {0, 0, 1, 1};
14268 const int UnpackHiMask[] = {2, 2, 3, 3};
14269 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14270 Mask = UnpackLoMask;
14271 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14272 Mask = UnpackHiMask;
14273
14274 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14275 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14276 }
14277
14278 if (Subtarget.hasAVX2())
14279 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14280 return Extract;
14281
14282 // Try to use shift instructions.
14283 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14284 Zeroable, Subtarget, DAG))
14285 return Shift;
14286
14287 // There are special ways we can lower some single-element blends.
14288 if (NumV2Elements == 1)
14289 if (SDValue V = lowerShuffleAsElementInsertion(
14290 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14291 return V;
14292
14293 // We have different paths for blend lowering, but they all must use the
14294 // *exact* same predicate.
14295 bool IsBlendSupported = Subtarget.hasSSE41();
14296 if (IsBlendSupported)
14297 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14298 Zeroable, Subtarget, DAG))
14299 return Blend;
14300
14301 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14302 Zeroable, Subtarget, DAG))
14303 return Masked;
14304
14305 // Use dedicated unpack instructions for masks that match their pattern.
14306 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14307 return V;
14308
14309 // Try to use byte rotation instructions.
14310 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14311 if (Subtarget.hasSSSE3()) {
14312 if (Subtarget.hasVLX())
14313 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14314 Subtarget, DAG))
14315 return Rotate;
14316
14317 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14318 Subtarget, DAG))
14319 return Rotate;
14320 }
14321
14322 // Assume that a single SHUFPS is faster than an alternative sequence of
14323 // multiple instructions (even if the CPU has a domain penalty).
14324 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14325 if (!isSingleSHUFPSMask(Mask)) {
14326 // If we have direct support for blends, we should lower by decomposing into
14327 // a permute. That will be faster than the domain cross.
14328 if (IsBlendSupported)
14329 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14330 Subtarget, DAG);
14331
14332 // Try to lower by permuting the inputs into an unpack instruction.
14333 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14334 Mask, Subtarget, DAG))
14335 return Unpack;
14336 }
14337
14338 // We implement this with SHUFPS because it can blend from two vectors.
14339 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14340 // up the inputs, bypassing domain shift penalties that we would incur if we
14341 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14342 // relevant.
14343 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14344 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14345 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14346 return DAG.getBitcast(MVT::v4i32, ShufPS);
14347}
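
Editorial aside, not part of the LLVM listing: a minimal sketch of how a 4-element shuffle mask such as the UnpackLoMask {0, 0, 1, 1} above is packed into the 8-bit immediate that PSHUFD consumes, two bits per destination lane. This mirrors what getV4X86ShuffleImm8ForMask computes; the standalone helper name is ours.

#include <array>
#include <cassert>
#include <cstdint>

// Pack a single-input v4 shuffle mask into a PSHUFD/SHUFPS-style immediate.
static uint8_t packV4ShuffleImm8(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int Lane = 0; Lane != 4; ++Lane) {
    int M = Mask[Lane] < 0 ? 0 : Mask[Lane]; // undef lanes may pick anything
    assert(M < 4 && "single-input v4 masks use indices 0..3");
    Imm |= static_cast<uint8_t>(M) << (2 * Lane);
  }
  return Imm; // {0, 0, 1, 1} -> 0x50, {2, 2, 3, 3} -> 0xFA
}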
14348
14349/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14350/// shuffle lowering, and the most complex part.
14351///
14352/// The lowering strategy is to try to form pairs of input lanes which are
14353/// targeted at the same half of the final vector, and then use a dword shuffle
14354/// to place them onto the right half, and finally unpack the paired lanes into
14355/// their final position.
14356///
14357/// The exact breakdown of how to form these dword pairs and align them on the
14358/// correct sides is really tricky. See the comments within the function for
14359/// more of the details.
14360///
14361/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14362/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14363/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14364/// vector, form the analogous 128-bit 8-element Mask.
14365static SDValue lowerV8I16GeneralSingleInputShuffle(
14366 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14367 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14368 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14369 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14370
14371 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14372 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14373 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14374
14375 // Attempt to directly match PSHUFLW or PSHUFHW.
14376 if (isUndefOrInRange(LoMask, 0, 4) &&
14377 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14378 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14379 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14380 }
14381 if (isUndefOrInRange(HiMask, 4, 8) &&
14382 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14383 for (int i = 0; i != 4; ++i)
14384 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14385 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14386 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14387 }
14388
14389 SmallVector<int, 4> LoInputs;
14390 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14391 array_pod_sort(LoInputs.begin(), LoInputs.end());
14392 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14393 SmallVector<int, 4> HiInputs;
14394 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14395 array_pod_sort(HiInputs.begin(), HiInputs.end());
14396 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14397 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14398 int NumHToL = LoInputs.size() - NumLToL;
14399 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14400 int NumHToH = HiInputs.size() - NumLToH;
14401 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14402 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14403 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14404 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14405
14406 // If we are shuffling values from one half - check how many different DWORD
14407 // pairs we need to create. If only 1 or 2 then we can perform this as a
14408 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14409 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14410 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14411 V = DAG.getNode(ShufWOp, DL, VT, V,
14412 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14413 V = DAG.getBitcast(PSHUFDVT, V);
14414 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14415 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14416 return DAG.getBitcast(VT, V);
14417 };
14418
14419 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14420 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14421 SmallVector<std::pair<int, int>, 4> DWordPairs;
14422 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14423
14424 // Collect the different DWORD pairs.
14425 for (int DWord = 0; DWord != 4; ++DWord) {
14426 int M0 = Mask[2 * DWord + 0];
14427 int M1 = Mask[2 * DWord + 1];
14428 M0 = (M0 >= 0 ? M0 % 4 : M0);
14429 M1 = (M1 >= 0 ? M1 % 4 : M1);
14430 if (M0 < 0 && M1 < 0)
14431 continue;
14432
14433 bool Match = false;
14434 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14435 auto &DWordPair = DWordPairs[j];
14436 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14437 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14438 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14439 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14440 PSHUFDMask[DWord] = DOffset + j;
14441 Match = true;
14442 break;
14443 }
14444 }
14445 if (!Match) {
14446 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14447 DWordPairs.push_back(std::make_pair(M0, M1));
14448 }
14449 }
14450
14451 if (DWordPairs.size() <= 2) {
14452 DWordPairs.resize(2, std::make_pair(-1, -1));
14453 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14454 DWordPairs[1].first, DWordPairs[1].second};
14455 if ((NumHToL + NumHToH) == 0)
14456 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14457 if ((NumLToL + NumLToH) == 0)
14458 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14459 }
14460 }
14461
14462 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14463 // such inputs we can swap two of the dwords across the half mark and end up
14464 // with <=2 inputs to each half in each half. Once there, we can fall through
14465 // to the generic code below. For example:
14466 //
14467 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14468 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14469 //
14470 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14471 // and an existing 2-into-2 on the other half. In this case we may have to
14472 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14473 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14474 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14475 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14476 // half than the one we target for fixing) will be fixed when we re-enter this
14477 // path. We will also combine any resulting sequence of PSHUFD instructions
14478 // into a single instruction. Here is an example of the tricky case:
14479 //
14480 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14481 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14482 //
14483 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14484 //
14485 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14486 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14487 //
14488 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14489 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14490 //
14491 // The result is fine to be handled by the generic logic.
14492 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14493 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14494 int AOffset, int BOffset) {
14495 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14496 "Must call this with A having 3 or 1 inputs from the A half.");
14497 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14498 "Must call this with B having 1 or 3 inputs from the B half.");
14499 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14500 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14501
14502 bool ThreeAInputs = AToAInputs.size() == 3;
14503
14504 // Compute the index of dword with only one word among the three inputs in
14505 // a half by taking the sum of the half with three inputs and subtracting
14506 // the sum of the actual three inputs. The difference is the remaining
14507 // slot.
14508 int ADWord = 0, BDWord = 0;
14509 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14510 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14511 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14512 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14513 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14514 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14515 int TripleNonInputIdx =
14516 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14517 TripleDWord = TripleNonInputIdx / 2;
14518
14519 // We use xor with one to compute the adjacent DWord to whichever one the
14520 // OneInput is in.
14521 OneInputDWord = (OneInput / 2) ^ 1;
14522
14523 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14524 // and BToA inputs. If there is also such a problem with the BToB and AToB
14525 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14526 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14527 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14528 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14529 // Compute how many inputs will be flipped by swapping these DWords. We
14530 // need
14531 // to balance this to ensure we don't form a 3-1 shuffle in the other
14532 // half.
14533 int NumFlippedAToBInputs =
14534 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14535 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14536 int NumFlippedBToBInputs =
14537 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14538 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14539 if ((NumFlippedAToBInputs == 1 &&
14540 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14541 (NumFlippedBToBInputs == 1 &&
14542 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14543 // We choose whether to fix the A half or B half based on whether that
14544 // half has zero flipped inputs. At zero, we may not be able to fix it
14545 // with that half. We also bias towards fixing the B half because that
14546 // will more commonly be the high half, and we have to bias one way.
14547 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14548 ArrayRef<int> Inputs) {
14549 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14550 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14551 // Determine whether the free index is in the flipped dword or the
14552 // unflipped dword based on where the pinned index is. We use this bit
14553 // in an xor to conditionally select the adjacent dword.
14554 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14555 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14556 if (IsFixIdxInput == IsFixFreeIdxInput)
14557 FixFreeIdx += 1;
14558 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14559 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14560 "We need to be changing the number of flipped inputs!");
14561 int PSHUFHalfMask[] = {0, 1, 2, 3};
14562 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14563 V = DAG.getNode(
14564 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14565 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14566 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14567
14568 for (int &M : Mask)
14569 if (M >= 0 && M == FixIdx)
14570 M = FixFreeIdx;
14571 else if (M >= 0 && M == FixFreeIdx)
14572 M = FixIdx;
14573 };
14574 if (NumFlippedBToBInputs != 0) {
14575 int BPinnedIdx =
14576 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14577 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14578 } else {
14579 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14580 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14581 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14582 }
14583 }
14584 }
14585
14586 int PSHUFDMask[] = {0, 1, 2, 3};
14587 PSHUFDMask[ADWord] = BDWord;
14588 PSHUFDMask[BDWord] = ADWord;
14589 V = DAG.getBitcast(
14590 VT,
14591 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14592 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14593
14594 // Adjust the mask to match the new locations of A and B.
14595 for (int &M : Mask)
14596 if (M >= 0 && M/2 == ADWord)
14597 M = 2 * BDWord + M % 2;
14598 else if (M >= 0 && M/2 == BDWord)
14599 M = 2 * ADWord + M % 2;
14600
14601 // Recurse back into this routine to re-compute state now that this isn't
14602 // a 3 and 1 problem.
14603 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14604 };
14605 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14606 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14607 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14608 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14609
14610 // At this point there are at most two inputs to the low and high halves from
14611 // each half. That means the inputs can always be grouped into dwords and
14612 // those dwords can then be moved to the correct half with a dword shuffle.
14613 // We use at most one low and one high word shuffle to collect these paired
14614 // inputs into dwords, and finally a dword shuffle to place them.
14615 int PSHUFLMask[4] = {-1, -1, -1, -1};
14616 int PSHUFHMask[4] = {-1, -1, -1, -1};
14617 int PSHUFDMask[4] = {-1, -1, -1, -1};
14618
14619 // First fix the masks for all the inputs that are staying in their
14620 // original halves. This will then dictate the targets of the cross-half
14621 // shuffles.
14622 auto fixInPlaceInputs =
14623 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14624 MutableArrayRef<int> SourceHalfMask,
14625 MutableArrayRef<int> HalfMask, int HalfOffset) {
14626 if (InPlaceInputs.empty())
14627 return;
14628 if (InPlaceInputs.size() == 1) {
14629 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14630 InPlaceInputs[0] - HalfOffset;
14631 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14632 return;
14633 }
14634 if (IncomingInputs.empty()) {
14635 // Just fix all of the in place inputs.
14636 for (int Input : InPlaceInputs) {
14637 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14638 PSHUFDMask[Input / 2] = Input / 2;
14639 }
14640 return;
14641 }
14642
14643 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14644 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14645 InPlaceInputs[0] - HalfOffset;
14646 // Put the second input next to the first so that they are packed into
14647 // a dword. We find the adjacent index by toggling the low bit.
14648 int AdjIndex = InPlaceInputs[0] ^ 1;
14649 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14650 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14651 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14652 };
14653 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14654 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14655
14656 // Now gather the cross-half inputs and place them into a free dword of
14657 // their target half.
14658 // FIXME: This operation could almost certainly be simplified dramatically to
14659 // look more like the 3-1 fixing operation.
14660 auto moveInputsToRightHalf = [&PSHUFDMask](
14661 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14662 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14663 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14664 int DestOffset) {
14665 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14666 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14667 };
14668 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14669 int Word) {
14670 int LowWord = Word & ~1;
14671 int HighWord = Word | 1;
14672 return isWordClobbered(SourceHalfMask, LowWord) ||
14673 isWordClobbered(SourceHalfMask, HighWord);
14674 };
14675
14676 if (IncomingInputs.empty())
14677 return;
14678
14679 if (ExistingInputs.empty()) {
14680 // Map any dwords with inputs from them into the right half.
14681 for (int Input : IncomingInputs) {
14682 // If the source half mask maps over the inputs, turn those into
14683 // swaps and use the swapped lane.
14684 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14685 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14686 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14687 Input - SourceOffset;
14688 // We have to swap the uses in our half mask in one sweep.
14689 for (int &M : HalfMask)
14690 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14691 M = Input;
14692 else if (M == Input)
14693 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14694 } else {
14695 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14696 Input - SourceOffset &&
14697 "Previous placement doesn't match!");
14698 }
14699 // Note that this correctly re-maps both when we do a swap and when
14700 // we observe the other side of the swap above. We rely on that to
14701 // avoid swapping the members of the input list directly.
14702 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14703 }
14704
14705 // Map the input's dword into the correct half.
14706 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14707 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14708 else
14709 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14710 Input / 2 &&
14711 "Previous placement doesn't match!");
14712 }
14713
14714 // And just directly shift any other-half mask elements to be same-half
14715 // as we will have mirrored the dword containing the element into the
14716 // same position within that half.
14717 for (int &M : HalfMask)
14718 if (M >= SourceOffset && M < SourceOffset + 4) {
14719 M = M - SourceOffset + DestOffset;
14720 assert(M >= 0 && "This should never wrap below zero!");
14721 }
14722 return;
14723 }
14724
14725 // Ensure we have the input in a viable dword of its current half. This
14726 // is particularly tricky because the original position may be clobbered
14727 // by inputs being moved and *staying* in that half.
14728 if (IncomingInputs.size() == 1) {
14729 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14730 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14731 SourceOffset;
14732 SourceHalfMask[InputFixed - SourceOffset] =
14733 IncomingInputs[0] - SourceOffset;
14734 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14735 InputFixed);
14736 IncomingInputs[0] = InputFixed;
14737 }
14738 } else if (IncomingInputs.size() == 2) {
14739 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14740 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14741 // We have two non-adjacent or clobbered inputs we need to extract from
14742 // the source half. To do this, we need to map them into some adjacent
14743 // dword slot in the source mask.
14744 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14745 IncomingInputs[1] - SourceOffset};
14746
14747 // If there is a free slot in the source half mask adjacent to one of
14748 // the inputs, place the other input in it. We use (Index XOR 1) to
14749 // compute an adjacent index.
14750 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14751 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14752 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14753 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14754 InputsFixed[1] = InputsFixed[0] ^ 1;
14755 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14756 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14757 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14758 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14759 InputsFixed[0] = InputsFixed[1] ^ 1;
14760 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14761 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14762 // The two inputs are in the same DWord but it is clobbered and the
14763 // adjacent DWord isn't used at all. Move both inputs to the free
14764 // slot.
14765 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14766 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14767 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14768 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14769 } else {
14770 // The only way we hit this point is if there is no clobbering
14771 // (because there are no off-half inputs to this half) and there is no
14772 // free slot adjacent to one of the inputs. In this case, we have to
14773 // swap an input with a non-input.
14774 for (int i = 0; i < 4; ++i)
14775 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14776 "We can't handle any clobbers here!");
14777 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14778 "Cannot have adjacent inputs here!");
14779
14780 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14781 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14782
14783 // We also have to update the final source mask in this case because
14784 // it may need to undo the above swap.
14785 for (int &M : FinalSourceHalfMask)
14786 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14787 M = InputsFixed[1] + SourceOffset;
14788 else if (M == InputsFixed[1] + SourceOffset)
14789 M = (InputsFixed[0] ^ 1) + SourceOffset;
14790
14791 InputsFixed[1] = InputsFixed[0] ^ 1;
14792 }
14793
14794 // Point everything at the fixed inputs.
14795 for (int &M : HalfMask)
14796 if (M == IncomingInputs[0])
14797 M = InputsFixed[0] + SourceOffset;
14798 else if (M == IncomingInputs[1])
14799 M = InputsFixed[1] + SourceOffset;
14800
14801 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14802 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14803 }
14804 } else {
14805 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14805)
;
14806 }
14807
14808 // Now hoist the DWord down to the right half.
14809 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14810 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14811 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14812 for (int &M : HalfMask)
14813 for (int Input : IncomingInputs)
14814 if (M == Input)
14815 M = FreeDWord * 2 + Input % 2;
14816 };
14817 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14818 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14819 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14820 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14821
14822 // Now enact all the shuffles we've computed to move the inputs into their
14823 // target half.
14824 if (!isNoopShuffleMask(PSHUFLMask))
14825 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14826 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14827 if (!isNoopShuffleMask(PSHUFHMask))
14828 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14829 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14830 if (!isNoopShuffleMask(PSHUFDMask))
14831 V = DAG.getBitcast(
14832 VT,
14833 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14834 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14835
14836 // At this point, each half should contain all its inputs, and we can then
14837 // just shuffle them into their final position.
14838 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14839 "Failed to lift all the high half inputs to the low mask!");
14840 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14841 "Failed to lift all the low half inputs to the high mask!");
14842
14843 // Do a half shuffle for the low mask.
14844 if (!isNoopShuffleMask(LoMask))
14845 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14846 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14847
14848 // Do a half shuffle with the high mask after shifting its values down.
14849 for (int &M : HiMask)
14850 if (M >= 0)
14851 M -= 4;
14852 if (!isNoopShuffleMask(HiMask))
14853 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14854 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14855
14856 return V;
14857}
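
Editorial aside, not part of the LLVM listing: a standalone sketch of the low/high "bucket" counts that drive the routine above (NumLToL, NumHToL, NumLToH, NumHToH), run on the 1-into-3 example mask from the comments. The real code dedups repeated inputs before counting; with no repeats in this mask the result is the same.

#include <array>
#include <cstdio>

int main() {
  // Destination lanes 0..3 form the low half, 4..7 the high half; mask values
  // 0..3 come from the low half of the input, 4..7 from the high half.
  std::array<int, 8> Mask = {0, 1, 2, 7, 4, 5, 6, 3};
  int NumLToL = 0, NumHToL = 0, NumLToH = 0, NumHToH = 0;
  for (int i = 0; i != 8; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef lane
    if (i < 4)
      (M < 4 ? NumLToL : NumHToL) += 1;
    else
      (M < 4 ? NumLToH : NumHToH) += 1;
  }
  // Prints "3 1 1 3": a 3:1 imbalance on each half, which is exactly the case
  // the balanceSides lambda rewrites into a <=2-inputs-per-half problem.
  std::printf("%d %d %d %d\n", NumLToL, NumHToL, NumLToH, NumHToH);
  return 0;
}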
14858
14859/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14860/// blend if only one input is used.
14861static SDValue lowerShuffleAsBlendOfPSHUFBs(
14862 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14863 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14864 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14865 "Lane crossing shuffle masks not supported");
14866
14867 int NumBytes = VT.getSizeInBits() / 8;
14868 int Size = Mask.size();
14869 int Scale = NumBytes / Size;
14870
14871 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14872 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14873 V1InUse = false;
14874 V2InUse = false;
14875
14876 for (int i = 0; i < NumBytes; ++i) {
14877 int M = Mask[i / Scale];
14878 if (M < 0)
14879 continue;
14880
14881 const int ZeroMask = 0x80;
14882 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14883 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14884 if (Zeroable[i / Scale])
14885 V1Idx = V2Idx = ZeroMask;
14886
14887 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14888 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14889 V1InUse |= (ZeroMask != V1Idx);
14890 V2InUse |= (ZeroMask != V2Idx);
14891 }
14892
14893 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14894 if (V1InUse)
14895 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14896 DAG.getBuildVector(ShufVT, DL, V1Mask));
14897 if (V2InUse)
14898 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14899 DAG.getBuildVector(ShufVT, DL, V2Mask));
14900
14901 // If we need shuffled inputs from both, blend the two.
14902 SDValue V;
14903 if (V1InUse && V2InUse)
14904 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14905 else
14906 V = V1InUse ? V1 : V2;
14907
14908 // Cast the result back to the correct type.
14909 return DAG.getBitcast(VT, V);
14910}
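
Editorial aside, not part of the LLVM listing: a simplified standalone model of the byte-selector construction above. Element indices below Size select from V1, indices >= Size select from V2, and a control byte of 0x80 makes PSHUFB zero the destination byte. This sketch omits the Zeroable handling and uses 0x80 where the real code keeps undef control bytes; the types and names are ours.

#include <cstdint>
#include <vector>

struct PSHUFBSelectors {
  std::vector<uint8_t> V1Bytes, V2Bytes;
  bool V1InUse = false, V2InUse = false;
};

static PSHUFBSelectors buildPSHUFBSelectors(const std::vector<int> &Mask,
                                            int NumBytes) {
  const uint8_t ZeroMask = 0x80; // PSHUFB: high bit set -> write 0
  int Size = static_cast<int>(Mask.size());
  int Scale = NumBytes / Size;   // bytes per shuffle element
  PSHUFBSelectors R;
  R.V1Bytes.assign(NumBytes, ZeroMask);
  R.V2Bytes.assign(NumBytes, ZeroMask);
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue; // undef element: both selectors keep zeroing this byte
    if (M < Size) {
      R.V1Bytes[i] = static_cast<uint8_t>(M * Scale + i % Scale);
      R.V1InUse = true;
    } else {
      R.V2Bytes[i] = static_cast<uint8_t>((M - Size) * Scale + i % Scale);
      R.V2InUse = true;
    }
  }
  return R;
}
// If only one of V1InUse/V2InUse ends up set, the caller can skip the final OR
// blend entirely, which is the "opportunistically avoiding the blend" above.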
14911
14912/// Generic lowering of 8-lane i16 shuffles.
14913///
14914/// This handles both single-input shuffles and combined shuffle/blends with
14915/// two inputs. The single input shuffles are immediately delegated to
14916/// a dedicated lowering routine.
14917///
14918/// The blends are lowered in one of three fundamental ways. If there are few
14919/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14920/// of the input is significantly cheaper when lowered as an interleaving of
14921/// the two inputs, try to interleave them. Otherwise, blend the low and high
14922/// halves of the inputs separately (making them have relatively few inputs)
14923/// and then concatenate them.
14924static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14925 const APInt &Zeroable, SDValue V1, SDValue V2,
14926 const X86Subtarget &Subtarget,
14927 SelectionDAG &DAG) {
14928 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14929 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14930 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14931
14932 // Whenever we can lower this as a zext, that instruction is strictly faster
14933 // than any alternative.
14934 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14935 Zeroable, Subtarget, DAG))
14936 return ZExt;
14937
14938 // Try to lower using a truncation.
14939 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14940 Subtarget, DAG))
14941 return V;
14942
14943 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14944
14945 if (NumV2Inputs == 0) {
14946 // Try to use shift instructions.
14947 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
14948 Zeroable, Subtarget, DAG))
14949 return Shift;
14950
14951 // Check for being able to broadcast a single element.
14952 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14953 Mask, Subtarget, DAG))
14954 return Broadcast;
14955
14956 // Try to use bit rotation instructions.
14957 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14958 Subtarget, DAG))
14959 return Rotate;
14960
14961 // Use dedicated unpack instructions for masks that match their pattern.
14962 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14963 return V;
14964
14965 // Use dedicated pack instructions for masks that match their pattern.
14966 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14967 Subtarget))
14968 return V;
14969
14970 // Try to use byte rotation instructions.
14971 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14972 Subtarget, DAG))
14973 return Rotate;
14974
14975 // Make a copy of the mask so it can be modified.
14976 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
14977 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14978 Subtarget, DAG);
14979 }
14980
14981 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14982 "All single-input shuffles should be canonicalized to be V1-input "
14983 "shuffles.");
14984
14985 // Try to use shift instructions.
14986 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
14987 Zeroable, Subtarget, DAG))
14988 return Shift;
14989
14990 // See if we can use SSE4A Extraction / Insertion.
14991 if (Subtarget.hasSSE4A())
14992 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14993 Zeroable, DAG))
14994 return V;
14995
14996 // There are special ways we can lower some single-element blends.
14997 if (NumV2Inputs == 1)
14998 if (SDValue V = lowerShuffleAsElementInsertion(
14999 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15000 return V;
15001
15002 // We have different paths for blend lowering, but they all must use the
15003 // *exact* same predicate.
15004 bool IsBlendSupported = Subtarget.hasSSE41();
15005 if (IsBlendSupported)
15006 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15007 Zeroable, Subtarget, DAG))
15008 return Blend;
15009
15010 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15011 Zeroable, Subtarget, DAG))
15012 return Masked;
15013
15014 // Use dedicated unpack instructions for masks that match their pattern.
15015 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15016 return V;
15017
15018 // Use dedicated pack instructions for masks that match their pattern.
15019 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15020 Subtarget))
15021 return V;
15022
15023 // Try to lower using a truncation.
15024 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15025 Subtarget, DAG))
15026 return V;
15027
15028 // Try to use byte rotation instructions.
15029 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15030 Subtarget, DAG))
15031 return Rotate;
15032
15033 if (SDValue BitBlend =
15034 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15035 return BitBlend;
15036
15037 // Try to use byte shift instructions to mask.
15038 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15039 Zeroable, Subtarget, DAG))
15040 return V;
15041
15042 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15043 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15044 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15045 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15046 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15047 !Subtarget.hasVLX()) {
15048 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15049 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15050 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15051 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15052 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15053 DWordClearMask);
15054 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15055 DWordClearMask);
15056 // Now pack things back together.
15057 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15058 if (NumEvenDrops == 2) {
15059 Result = DAG.getBitcast(MVT::v4i32, Result);
15060 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15061 }
15062 return Result;
15063 }
15064
15065 // Try to lower by permuting the inputs into an unpack instruction.
15066 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15067 Mask, Subtarget, DAG))
15068 return Unpack;
15069
15070 // If we can't directly blend but can use PSHUFB, that will be better as it
15071 // can both shuffle and set up the inefficient blend.
15072 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15073 bool V1InUse, V2InUse;
15074 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15075 Zeroable, DAG, V1InUse, V2InUse);
15076 }
15077
15078 // We can always bit-blend if we have to so the fallback strategy is to
15079 // decompose into single-input permutes and blends/unpacks.
15080 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15081 Mask, Subtarget, DAG);
15082}
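
Editorial aside, not part of the LLVM listing: a scalar model of the NumEvenDrops == 1 path above, i.e. a mask that keeps only the even-indexed i16 elements of both inputs. Masking every 32-bit lane down to its low 16 bits guarantees that the unsigned-saturating pack (PACKUSDW) copies the value through unchanged. The helper name is ours.

#include <array>
#include <cstdint>

// Model of: AND each dword with 0x0000FFFF, then PACKUSDW the two results.
static std::array<uint16_t, 8> packEvenI16Elements(const std::array<uint16_t, 8> &V1,
                                                   const std::array<uint16_t, 8> &V2) {
  auto clearAndPack = [](const std::array<uint16_t, 8> &V, uint16_t *Out) {
    for (int i = 0; i != 4; ++i) {
      uint32_t DWord = static_cast<uint32_t>(V[2 * i]) |
                       (static_cast<uint32_t>(V[2 * i + 1]) << 16);
      DWord &= 0xFFFFu; // the DWordClearMask AND: drop the odd element
      // PACKUSDW would saturate to 0xFFFF, but the AND makes that a no-op.
      Out[i] = static_cast<uint16_t>(DWord);
    }
  };
  std::array<uint16_t, 8> Result;
  clearAndPack(V1, Result.data());     // low four lanes come from V1
  clearAndPack(V2, Result.data() + 4); // high four lanes come from V2
  return Result; // equivalent to shuffle mask <0, 2, 4, 6, 8, 10, 12, 14>
}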
15083
15084 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15085 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15086 // the active subvector is extracted.
15087static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15088 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15089 const X86Subtarget &Subtarget,
15090 SelectionDAG &DAG) {
15091 MVT MaskVT = VT.changeTypeToInteger();
15092 SDValue MaskNode;
15093 MVT ShuffleVT = VT;
15094 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15095 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15096 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15097 ShuffleVT = V1.getSimpleValueType();
15098
15099 // Adjust mask to correct indices for the second input.
15100 int NumElts = VT.getVectorNumElements();
15101 unsigned Scale = 512 / VT.getSizeInBits();
15102 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15103 for (int &M : AdjustedMask)
15104 if (NumElts <= M)
15105 M += (Scale - 1) * NumElts;
15106 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15107 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15108 } else {
15109 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15110 }
15111
15112 SDValue Result;
15113 if (V2.isUndef())
15114 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15115 else
15116 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15117
15118 if (VT != ShuffleVT)
15119 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15120
15121 return Result;
15122}
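
Editorial aside, not part of the LLVM listing: why the mask needs re-indexing when a sub-512-bit VPERMV3 is widened, as in the AdjustedMask loop above. For a 256-bit v8i32 shuffle padded to v16i32, Scale = 512 / 256 = 2, so indices that referred to V2 elements must be bumped past the padding appended to V1. Undef (negative) entries are left alone. The helper name is ours.

#include <vector>

// Re-index a two-input shuffle mask after both operands are widened by Scale.
static std::vector<int> widenTwoInputMask(const std::vector<int> &Mask,
                                          int NumElts, int Scale) {
  std::vector<int> Adjusted(Mask.begin(), Mask.end());
  for (int &M : Adjusted)
    if (M >= NumElts)               // element comes from the second operand
      M += (Scale - 1) * NumElts;   // skip over the padding appended to V1
  return Adjusted;
}
// Example: NumElts = 8, Scale = 2. Mask entry 9 (element 1 of V2) becomes 17,
// which is again element 1 of the second operand once both operands are
// 16 elements wide.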
15123
15124/// Generic lowering of v16i8 shuffles.
15125///
15126/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15127/// detect any complexity reducing interleaving. If that doesn't help, it uses
15128/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15129/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15130/// back together.
15131static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15132 const APInt &Zeroable, SDValue V1, SDValue V2,
15133 const X86Subtarget &Subtarget,
15134 SelectionDAG &DAG) {
15135 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15136 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15137 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15138
15139 // Try to use shift instructions.
15140 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15141 Zeroable, Subtarget, DAG))
15142 return Shift;
15143
15144 // Try to use byte rotation instructions.
15145 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15146 Subtarget, DAG))
15147 return Rotate;
15148
15149 // Use dedicated pack instructions for masks that match their pattern.
15150 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15151 Subtarget))
15152 return V;
15153
15154 // Try to use a zext lowering.
15155 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15156 Zeroable, Subtarget, DAG))
15157 return ZExt;
15158
15159 // Try to lower using a truncation.
15160 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15161 Subtarget, DAG))
15162 return V;
15163
15164 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15165 Subtarget, DAG))
15166 return V;
15167
15168 // See if we can use SSE4A Extraction / Insertion.
15169 if (Subtarget.hasSSE4A())
15170 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15171 Zeroable, DAG))
15172 return V;
15173
15174 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15175
15176 // For single-input shuffles, there are some nicer lowering tricks we can use.
15177 if (NumV2Elements == 0) {
15178 // Check for being able to broadcast a single element.
15179 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15180 Mask, Subtarget, DAG))
15181 return Broadcast;
15182
15183 // Try to use bit rotation instructions.
15184 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15185 Subtarget, DAG))
15186 return Rotate;
15187
15188 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15189 return V;
15190
15191 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15192 // Notably, this handles splat and partial-splat shuffles more efficiently.
15193 // However, it only makes sense if the pre-duplication shuffle simplifies
15194 // things significantly. Currently, this means we need to be able to
15195 // express the pre-duplication shuffle as an i16 shuffle.
15196 //
15197 // FIXME: We should check for other patterns which can be widened into an
15198 // i16 shuffle as well.
15199 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15200 for (int i = 0; i < 16; i += 2)
15201 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15202 return false;
15203
15204 return true;
15205 };
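    // Illustration: a byte-splat mask such as <5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5>
    // satisfies canWidenViaDuplication because every adjacent byte pair agrees,
    // whereas <0,1,2,3, ...> fails on the very first pair (0 != 1).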
15206 auto tryToWidenViaDuplication = [&]() -> SDValue {
15207 if (!canWidenViaDuplication(Mask))
15208 return SDValue();
15209 SmallVector<int, 4> LoInputs;
15210 copy_if(Mask, std::back_inserter(LoInputs),
15211 [](int M) { return M >= 0 && M < 8; });
15212 array_pod_sort(LoInputs.begin(), LoInputs.end());
15213 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15214 LoInputs.end());
15215 SmallVector<int, 4> HiInputs;
15216 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15217 array_pod_sort(HiInputs.begin(), HiInputs.end());
15218 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15219 HiInputs.end());
15220
15221 bool TargetLo = LoInputs.size() >= HiInputs.size();
15222 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15223 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15224
15225 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15226 SmallDenseMap<int, int, 8> LaneMap;
15227 for (int I : InPlaceInputs) {
15228 PreDupI16Shuffle[I/2] = I/2;
15229 LaneMap[I] = I;
15230 }
15231 int j = TargetLo ? 0 : 4, je = j + 4;
15232 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15233 // Check if j is already a shuffle of this input. This happens when
15234 // there are two adjacent bytes after we move the low one.
15235 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15236 // If we haven't yet mapped the input, search for a slot into which
15237 // we can map it.
15238 while (j < je && PreDupI16Shuffle[j] >= 0)
15239 ++j;
15240
15241 if (j == je)
15242 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15243 return SDValue();
15244
15245 // Map this input with the i16 shuffle.
15246 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15247 }
15248
15249 // Update the lane map based on the mapping we ended up with.
15250 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15251 }
15252 V1 = DAG.getBitcast(
15253 MVT::v16i8,
15254 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15255 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15256
15257 // Unpack the bytes to form the i16s that will be shuffled into place.
15258 bool EvenInUse = false, OddInUse = false;
15259 for (int i = 0; i < 16; i += 2) {
15260 EvenInUse |= (Mask[i + 0] >= 0);
15261 OddInUse |= (Mask[i + 1] >= 0);
15262 if (EvenInUse && OddInUse)
15263 break;
15264 }
15265 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15266 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15267 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15268
15269 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15270 for (int i = 0; i < 16; ++i)
15271 if (Mask[i] >= 0) {
15272 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15273         assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15274 if (PostDupI16Shuffle[i / 2] < 0)
15275 PostDupI16Shuffle[i / 2] = MappedMask;
15276 else
15277           assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15278                  "Conflicting entries in the original shuffle!");
15279 }
15280 return DAG.getBitcast(
15281 MVT::v16i8,
15282 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15283 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15284 };
15285 if (SDValue V = tryToWidenViaDuplication())
15286 return V;
15287 }
15288
15289 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15290 Zeroable, Subtarget, DAG))
15291 return Masked;
15292
15293 // Use dedicated unpack instructions for masks that match their pattern.
15294 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15295 return V;
15296
15297 // Try to use byte shift instructions to mask.
15298 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15299 Zeroable, Subtarget, DAG))
15300 return V;
15301
15302 // Check for compaction patterns.
15303 bool IsSingleInput = V2.isUndef();
15304 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15305
15306 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15307 // with PSHUFB. It is important to do this before we attempt to generate any
15308 // blends but after all of the single-input lowerings. If the single input
15309 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15310 // want to preserve that and we can DAG combine any longer sequences into
15311 // a PSHUFB in the end. But once we start blending from multiple inputs,
15312 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15313 // and there are *very* few patterns that would actually be faster than the
15314 // PSHUFB approach because of its ability to zero lanes.
15315 //
15316 // If the mask is a binary compaction, we can more efficiently perform this
15317 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15318 //
15319 // FIXME: The only exceptions to the above are blends which are exact
15320 // interleavings with direct instructions supporting them. We currently don't
15321 // handle those well here.
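  // Conceptually, the PSHUFB blend below builds the result as
  // OR(PSHUFB(V1, M1), PSHUFB(V2, M2)): each control byte either selects a
  // byte from its source or has bit 7 set (0x80) to force that lane to zero,
  // so the OR acts as the blend.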
15322 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15323 bool V1InUse = false;
15324 bool V2InUse = false;
15325
15326 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15327 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15328
15329 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15330 // do so. This avoids using them to handle blends-with-zero which is
15331 // important as a single pshufb is significantly faster for that.
15332 if (V1InUse && V2InUse) {
15333 if (Subtarget.hasSSE41())
15334 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15335 Zeroable, Subtarget, DAG))
15336 return Blend;
15337
15338 // We can use an unpack to do the blending rather than an or in some
15339 // cases. Even though the or may be (very minorly) more efficient, we
15340       // prefer this lowering because there are common cases where part of
15341 // the complexity of the shuffles goes away when we do the final blend as
15342 // an unpack.
15343 // FIXME: It might be worth trying to detect if the unpack-feeding
15344 // shuffles will both be pshufb, in which case we shouldn't bother with
15345 // this.
15346 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15347 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15348 return Unpack;
15349
15350 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15351 if (Subtarget.hasVBMI())
15352 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15353 DAG);
15354
15355 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15356 if (Subtarget.hasXOP()) {
15357 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15358 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15359 }
15360
15361 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15362 // PALIGNR will be cheaper than the second PSHUFB+OR.
15363 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15364 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15365 return V;
15366 }
15367
15368 return PSHUFB;
15369 }
15370
15371 // There are special ways we can lower some single-element blends.
15372 if (NumV2Elements == 1)
15373 if (SDValue V = lowerShuffleAsElementInsertion(
15374 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15375 return V;
15376
15377 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15378 return Blend;
15379
15380 // Check whether a compaction lowering can be done. This handles shuffles
15381 // which take every Nth element for some even N. See the helper function for
15382 // details.
15383 //
15384 // We special case these as they can be particularly efficiently handled with
15385   // the PACKUSWB instruction on x86 and they show up in common patterns of
15386 // rearranging bytes to truncate wide elements.
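  // For example, the even-byte compaction mask <0,2,4,...,28,30> (NumEvenDrops
  // == 1) is handled by clearing the high byte of every 16-bit word in both
  // inputs (AND with 0x00FF) and emitting a single PACKUS; each additional
  // drop level just repeats the pack on the intermediate result.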
15387 if (NumEvenDrops) {
15388 // NumEvenDrops is the power of two stride of the elements. Another way of
15389 // thinking about it is that we need to drop the even elements this many
15390 // times to get the original input.
15391
15392 // First we need to zero all the dropped bytes.
15393     assert(NumEvenDrops <= 3 &&
15394            "No support for dropping even elements more than 3 times.");
15395 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15396 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15397 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15398 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15399 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15400 WordClearMask);
15401 if (!IsSingleInput)
15402 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15403 WordClearMask);
15404
15405 // Now pack things back together.
15406 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15407 IsSingleInput ? V1 : V2);
15408 for (int i = 1; i < NumEvenDrops; ++i) {
15409 Result = DAG.getBitcast(MVT::v8i16, Result);
15410 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15411 }
15412 return Result;
15413 }
15414
15415 // Handle multi-input cases by blending/unpacking single-input shuffles.
15416 if (NumV2Elements > 0)
15417 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15418 Subtarget, DAG);
15419
15420 // The fallback path for single-input shuffles widens this into two v8i16
15421 // vectors with unpacks, shuffles those, and then pulls them back together
15422 // with a pack.
15423 SDValue V = V1;
15424
15425 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15426 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15427 for (int i = 0; i < 16; ++i)
15428 if (Mask[i] >= 0)
15429 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15430
15431 SDValue VLoHalf, VHiHalf;
15432 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15433 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15434 // i16s.
15435 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15436 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15437 // Use a mask to drop the high bytes.
15438 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15439 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15440 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15441
15442 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15443 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15444
15445 // Squash the masks to point directly into VLoHalf.
15446 for (int &M : LoBlendMask)
15447 if (M >= 0)
15448 M /= 2;
15449 for (int &M : HiBlendMask)
15450 if (M >= 0)
15451 M /= 2;
15452 } else {
15453 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15454 // VHiHalf so that we can blend them as i16s.
15455 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15456
15457 VLoHalf = DAG.getBitcast(
15458 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15459 VHiHalf = DAG.getBitcast(
15460 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15461 }
15462
15463 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15464 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15465
15466 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15467}
15468
15469/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15470///
15471/// This routine breaks down the specific type of 128-bit shuffle and
15472/// dispatches to the lowering routines accordingly.
15473static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15474 MVT VT, SDValue V1, SDValue V2,
15475 const APInt &Zeroable,
15476 const X86Subtarget &Subtarget,
15477 SelectionDAG &DAG) {
15478 switch (VT.SimpleTy) {
15479 case MVT::v2i64:
15480 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15481 case MVT::v2f64:
15482 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15483 case MVT::v4i32:
15484 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15485 case MVT::v4f32:
15486 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15487 case MVT::v8i16:
15488 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15489 case MVT::v16i8:
15490 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15491
15492 default:
15493     llvm_unreachable("Unimplemented!");
15494 }
15495}
15496
15497/// Generic routine to split vector shuffle into half-sized shuffles.
15498///
15499/// This routine just extracts two subvectors, shuffles them independently, and
15500/// then concatenates them back together. This should work effectively with all
15501/// AVX vector shuffle types.
15502static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15503 SDValue V2, ArrayRef<int> Mask,
15504 SelectionDAG &DAG) {
15505   assert(VT.getSizeInBits() >= 256 &&
15506          "Only for 256-bit or wider vector shuffles!");
15507   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15508   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15509
15510 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15511 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15512
15513 int NumElements = VT.getVectorNumElements();
15514 int SplitNumElements = NumElements / 2;
15515 MVT ScalarVT = VT.getVectorElementType();
15516 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15517
15518 // Use splitVector/extractSubVector so that split build-vectors just build two
15519 // narrower build vectors. This helps shuffling with splats and zeros.
15520 auto SplitVector = [&](SDValue V) {
15521 SDValue LoV, HiV;
15522 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15523 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15524 DAG.getBitcast(SplitVT, HiV));
15525 };
15526
15527 SDValue LoV1, HiV1, LoV2, HiV2;
15528 std::tie(LoV1, HiV1) = SplitVector(V1);
15529 std::tie(LoV2, HiV2) = SplitVector(V2);
15530
15531 // Now create two 4-way blends of these half-width vectors.
15532 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15533 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15534 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15535 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15536 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15537 for (int i = 0; i < SplitNumElements; ++i) {
15538 int M = HalfMask[i];
15539 if (M >= NumElements) {
15540 if (M >= NumElements + SplitNumElements)
15541 UseHiV2 = true;
15542 else
15543 UseLoV2 = true;
15544 V2BlendMask[i] = M - NumElements;
15545 BlendMask[i] = SplitNumElements + i;
15546 } else if (M >= 0) {
15547 if (M >= SplitNumElements)
15548 UseHiV1 = true;
15549 else
15550 UseLoV1 = true;
15551 V1BlendMask[i] = M;
15552 BlendMask[i] = i;
15553 }
15554 }
15555
15556 // Because the lowering happens after all combining takes place, we need to
15557 // manually combine these blend masks as much as possible so that we create
15558 // a minimal number of high-level vector shuffle nodes.
15559
15560 // First try just blending the halves of V1 or V2.
15561 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15562 return DAG.getUNDEF(SplitVT);
15563 if (!UseLoV2 && !UseHiV2)
15564 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15565 if (!UseLoV1 && !UseHiV1)
15566 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15567
15568 SDValue V1Blend, V2Blend;
15569 if (UseLoV1 && UseHiV1) {
15570 V1Blend =
15571 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15572 } else {
15573 // We only use half of V1 so map the usage down into the final blend mask.
15574 V1Blend = UseLoV1 ? LoV1 : HiV1;
15575 for (int i = 0; i < SplitNumElements; ++i)
15576 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15577 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15578 }
15579 if (UseLoV2 && UseHiV2) {
15580 V2Blend =
15581 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15582 } else {
15583 // We only use half of V2 so map the usage down into the final blend mask.
15584 V2Blend = UseLoV2 ? LoV2 : HiV2;
15585 for (int i = 0; i < SplitNumElements; ++i)
15586 if (BlendMask[i] >= SplitNumElements)
15587 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15588 }
15589 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15590 };
15591 SDValue Lo = HalfBlend(LoMask);
15592 SDValue Hi = HalfBlend(HiMask);
15593 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15594}
15595
15596/// Either split a vector in halves or decompose the shuffles and the
15597/// blend/unpack.
15598///
15599/// This is provided as a good fallback for many lowerings of non-single-input
15600/// shuffles with more than one 128-bit lane. In those cases, we want to select
15601/// between splitting the shuffle into 128-bit components and stitching those
15602/// back together vs. extracting the single-input shuffles and blending those
15603/// results.
15604static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15605 SDValue V2, ArrayRef<int> Mask,
15606 const X86Subtarget &Subtarget,
15607 SelectionDAG &DAG) {
15608   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15609                           "shuffles as it could then recurse on itself.");
15610 int Size = Mask.size();
15611
15612 // If this can be modeled as a broadcast of two elements followed by a blend,
15613 // prefer that lowering. This is especially important because broadcasts can
15614 // often fold with memory operands.
15615 auto DoBothBroadcast = [&] {
15616 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15617 for (int M : Mask)
15618 if (M >= Size) {
15619 if (V2BroadcastIdx < 0)
15620 V2BroadcastIdx = M - Size;
15621 else if (M - Size != V2BroadcastIdx)
15622 return false;
15623 } else if (M >= 0) {
15624 if (V1BroadcastIdx < 0)
15625 V1BroadcastIdx = M;
15626 else if (M != V1BroadcastIdx)
15627 return false;
15628 }
15629 return true;
15630 };
15631 if (DoBothBroadcast())
15632 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15633 DAG);
15634
15635 // If the inputs all stem from a single 128-bit lane of each input, then we
15636 // split them rather than blending because the split will decompose to
15637 // unusually few instructions.
15638 int LaneCount = VT.getSizeInBits() / 128;
15639 int LaneSize = Size / LaneCount;
15640 SmallBitVector LaneInputs[2];
15641 LaneInputs[0].resize(LaneCount, false);
15642 LaneInputs[1].resize(LaneCount, false);
15643 for (int i = 0; i < Size; ++i)
15644 if (Mask[i] >= 0)
15645 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15646 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15647 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15648
15649 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15650 // requires that the decomposed single-input shuffles don't end up here.
15651 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15652 DAG);
15653}
15654
15655// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15656// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15657static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15658 SDValue V1, SDValue V2,
15659 ArrayRef<int> Mask,
15660 SelectionDAG &DAG) {
15661   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15662
15663 int LHSMask[4] = {-1, -1, -1, -1};
15664 int RHSMask[4] = {-1, -1, -1, -1};
15665 unsigned SHUFPMask = 0;
15666
15667 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15668 // perform the shuffle once the lanes have been shuffled in place.
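  // For example, with Mask = <1, 4, 3, 6> this produces LHSMask = <-1, 1, -1, 3>,
  // RHSMask = <4, -1, 6, -1> and SHUFPMask = 0b0101, so the SHUFPD picks
  // V1[1], V2[0], V1[3] and V2[2] for the four result elements.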
15669 for (int i = 0; i != 4; ++i) {
15670 int M = Mask[i];
15671 if (M < 0)
15672 continue;
15673 int LaneBase = i & ~1;
15674 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15675 LaneMask[LaneBase + (M & 1)] = M;
15676 SHUFPMask |= (M & 1) << i;
15677 }
15678
15679 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15680 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15681 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15682 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15683}
15684
15685/// Lower a vector shuffle crossing multiple 128-bit lanes as
15686/// a lane permutation followed by a per-lane permutation.
15687///
15688/// This is mainly for cases where we can have non-repeating permutes
15689/// in each lane.
15690///
15691/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15692/// we should investigate merging them.
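/// For example, a v8f32 reversal <7,6,5,4,3,2,1,0> becomes a 128-bit lane swap
/// (the cross-lane mask) followed by the in-lane reversal <3,2,1,0,7,6,5,4>.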
15693static SDValue lowerShuffleAsLanePermuteAndPermute(
15694 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15695 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15696 int NumElts = VT.getVectorNumElements();
15697 int NumLanes = VT.getSizeInBits() / 128;
15698 int NumEltsPerLane = NumElts / NumLanes;
15699 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15700
15701 /// Attempts to find a sublane permute with the given size
15702 /// that gets all elements into their target lanes.
15703 ///
15704   /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15705   /// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15706 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15707 int NumSublanesPerLane = NumSublanes / NumLanes;
15708 int NumEltsPerSublane = NumElts / NumSublanes;
15709
15710 SmallVector<int, 16> CrossLaneMask;
15711 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15712 // CrossLaneMask but one entry == one sublane.
15713 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15714
15715 for (int i = 0; i != NumElts; ++i) {
15716 int M = Mask[i];
15717 if (M < 0)
15718 continue;
15719
15720 int SrcSublane = M / NumEltsPerSublane;
15721 int DstLane = i / NumEltsPerLane;
15722
15723 // We only need to get the elements into the right lane, not sublane.
15724 // So search all sublanes that make up the destination lane.
15725 bool Found = false;
15726 int DstSubStart = DstLane * NumSublanesPerLane;
15727 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15728 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15729 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15730 continue;
15731
15732 Found = true;
15733 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15734 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15735 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15736 break;
15737 }
15738 if (!Found)
15739 return SDValue();
15740 }
15741
15742 // Fill CrossLaneMask using CrossLaneMaskLarge.
15743 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15744
15745 if (!CanUseSublanes) {
15746 // If we're only shuffling a single lowest lane and the rest are identity
15747 // then don't bother.
15748 // TODO - isShuffleMaskInputInPlace could be extended to something like
15749 // this.
15750 int NumIdentityLanes = 0;
15751 bool OnlyShuffleLowestLane = true;
15752 for (int i = 0; i != NumLanes; ++i) {
15753 int LaneOffset = i * NumEltsPerLane;
15754 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15755 i * NumEltsPerLane))
15756 NumIdentityLanes++;
15757 else if (CrossLaneMask[LaneOffset] != 0)
15758 OnlyShuffleLowestLane = false;
15759 }
15760 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15761 return SDValue();
15762 }
15763
15764 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15765 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15766 InLaneMask);
15767 };
15768
15769 // First attempt a solution with full lanes.
15770 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15771 return V;
15772
15773 // The rest of the solutions use sublanes.
15774 if (!CanUseSublanes)
15775 return SDValue();
15776
15777 // Then attempt a solution with 64-bit sublanes (vpermq).
15778 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15779 return V;
15780
15781 // If that doesn't work and we have fast variable shuffle,
15782 // attempt 32-bit sublanes (vpermd).
15783 if (!Subtarget.hasFastVariableShuffle())
15784 return SDValue();
15785
15786 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15787}
15788
15789/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15790/// source with a lane permutation.
15791///
15792/// This lowering strategy results in four instructions in the worst case for a
15793/// single-input cross lane shuffle which is lower than any other fully general
15794/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15795/// shuffle pattern should be handled prior to trying this lowering.
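/// For example, a v4f64 shuffle <2,1,0,3> first swaps the 128-bit lanes of V1
/// into a temporary ("Flipped") and then performs the in-lane shuffle
/// <4,1,6,3> of V1 and Flipped, placing every element without any further
/// lane crossing.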
15796static SDValue lowerShuffleAsLanePermuteAndShuffle(
15797 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15798 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15799 // FIXME: This should probably be generalized for 512-bit vectors as well.
15800   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15801 int Size = Mask.size();
15802 int LaneSize = Size / 2;
15803
15804 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15805 // Only do this if the elements aren't all from the lower lane,
15806 // otherwise we're (probably) better off doing a split.
15807 if (VT == MVT::v4f64 &&
15808 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15809 if (SDValue V =
15810 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
15811 return V;
15812
15813 // If there are only inputs from one 128-bit lane, splitting will in fact be
15814 // less expensive. The flags track whether the given lane contains an element
15815 // that crosses to another lane.
15816 if (!Subtarget.hasAVX2()) {
15817 bool LaneCrossing[2] = {false, false};
15818 for (int i = 0; i < Size; ++i)
15819 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15820 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15821 if (!LaneCrossing[0] || !LaneCrossing[1])
15822 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15823 } else {
15824 bool LaneUsed[2] = {false, false};
15825 for (int i = 0; i < Size; ++i)
15826 if (Mask[i] >= 0)
15827 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15828 if (!LaneUsed[0] || !LaneUsed[1])
15829 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15830 }
15831
15832 // TODO - we could support shuffling V2 in the Flipped input.
15833   assert(V2.isUndef() &&
15834          "This last part of this routine only works on single input shuffles");
15835
15836 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
15837 for (int i = 0; i < Size; ++i) {
15838 int &M = InLaneMask[i];
15839 if (M < 0)
15840 continue;
15841 if (((M % Size) / LaneSize) != (i / LaneSize))
15842 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15843 }
15844   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15845          "In-lane shuffle mask expected");
15846
15847 // Flip the lanes, and shuffle the results which should now be in-lane.
15848 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15849 SDValue Flipped = DAG.getBitcast(PVT, V1);
15850 Flipped =
15851 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15852 Flipped = DAG.getBitcast(VT, Flipped);
15853 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15854}
15855
15856/// Handle lowering 2-lane 128-bit shuffles.
15857static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15858 SDValue V2, ArrayRef<int> Mask,
15859 const APInt &Zeroable,
15860 const X86Subtarget &Subtarget,
15861 SelectionDAG &DAG) {
15862 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15863 if (Subtarget.hasAVX2() && V2.isUndef())
15864 return SDValue();
15865
15866 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15867
15868 SmallVector<int, 4> WidenedMask;
15869 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15870 return SDValue();
15871
15872 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15873 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15874
15875 // Try to use an insert into a zero vector.
15876 if (WidenedMask[0] == 0 && IsHighZero) {
15877 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15878 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15879 DAG.getIntPtrConstant(0, DL));
15880 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15881 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15882 DAG.getIntPtrConstant(0, DL));
15883 }
15884
15885   // TODO: If minimizing size and one of the inputs is a zero vector and
15886   // the zero vector has only one use, we could use a VPERM2X128 to save the
15887 // instruction bytes needed to explicitly generate the zero vector.
15888
15889 // Blends are faster and handle all the non-lane-crossing cases.
15890 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15891 Subtarget, DAG))
15892 return Blend;
15893
15894 // If either input operand is a zero vector, use VPERM2X128 because its mask
15895 // allows us to replace the zero input with an implicit zero.
15896 if (!IsLowZero && !IsHighZero) {
15897 // Check for patterns which can be matched with a single insert of a 128-bit
15898 // subvector.
15899 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15900 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15901
15902 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15903 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15904 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15905 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15906 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15907 OnlyUsesV1 ? V1 : V2,
15908 DAG.getIntPtrConstant(0, DL));
15909 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15910 DAG.getIntPtrConstant(2, DL));
15911 }
15912 }
15913
15914 // Try to use SHUF128 if possible.
15915 if (Subtarget.hasVLX()) {
15916 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15917 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15918 ((WidenedMask[1] % 2) << 1);
15919 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15920 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15921 }
15922 }
15923 }
15924
15925 // Otherwise form a 128-bit permutation. After accounting for undefs,
15926 // convert the 64-bit shuffle mask selection values into 128-bit
15927 // selection bits by dividing the indexes by 2 and shifting into positions
15928 // defined by a vperm2*128 instruction's immediate control byte.
15929
15930 // The immediate permute control byte looks like this:
15931 // [1:0] - select 128 bits from sources for low half of destination
15932 // [2] - ignore
15933 // [3] - zero low half of destination
15934 // [5:4] - select 128 bits from sources for high half of destination
15935 // [6] - ignore
15936 // [7] - zero high half of destination
15937
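  // For example, the v4i64 mask <2,3,6,7> widens to <1,3>, giving
  // PermMask = (1 << 0) | (3 << 4) = 0x31: the upper half of V1 feeds the low
  // half of the result and the upper half of V2 feeds the high half. A
  // zeroable half instead sets bit 3 (0x08) or bit 7 (0x80).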
15938   assert((WidenedMask[0] >= 0 || IsLowZero) &&
15939          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15940
15941 unsigned PermMask = 0;
15942 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15943 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15944
15945 // Check the immediate mask and replace unused sources with undef.
15946 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15947 V1 = DAG.getUNDEF(VT);
15948 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15949 V2 = DAG.getUNDEF(VT);
15950
15951 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15952 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15953}
15954
15955/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15956/// shuffling each lane.
15957///
15958/// This attempts to create a repeated lane shuffle where each lane uses one
15959/// or two of the lanes of the inputs. The lanes of the input vectors are
15960/// shuffled in one or two independent shuffles to get the lanes into the
15961/// position needed by the final shuffle.
15962static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15963 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15964 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15965   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15966
15967 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15968 return SDValue();
15969
15970 int NumElts = Mask.size();
15971 int NumLanes = VT.getSizeInBits() / 128;
15972 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15973 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15974 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15975
15976 // First pass will try to fill in the RepeatMask from lanes that need two
15977 // sources.
15978 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15979 int Srcs[2] = {-1, -1};
15980 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15981 for (int i = 0; i != NumLaneElts; ++i) {
15982 int M = Mask[(Lane * NumLaneElts) + i];
15983 if (M < 0)
15984 continue;
15985 // Determine which of the possible input lanes (NumLanes from each source)
15986 // this element comes from. Assign that as one of the sources for this
15987       // lane. We can assign up to 2 sources for this lane. If we run out of
15988       // sources, we can't do anything.
15989 int LaneSrc = M / NumLaneElts;
15990 int Src;
15991 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15992 Src = 0;
15993 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15994 Src = 1;
15995 else
15996 return SDValue();
15997
15998 Srcs[Src] = LaneSrc;
15999 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16000 }
16001
16002 // If this lane has two sources, see if it fits with the repeat mask so far.
16003 if (Srcs[1] < 0)
16004 continue;
16005
16006 LaneSrcs[Lane][0] = Srcs[0];
16007 LaneSrcs[Lane][1] = Srcs[1];
16008
16009 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16010       assert(M1.size() == M2.size() && "Unexpected mask size");
16011 for (int i = 0, e = M1.size(); i != e; ++i)
16012 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16013 return false;
16014 return true;
16015 };
16016
16017 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16018       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16019 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16020 int M = Mask[i];
16021 if (M < 0)
16022 continue;
16023         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16024                "Unexpected mask element");
16025 MergedMask[i] = M;
16026 }
16027 };
16028
16029 if (MatchMasks(InLaneMask, RepeatMask)) {
16030 // Merge this lane mask into the final repeat mask.
16031 MergeMasks(InLaneMask, RepeatMask);
16032 continue;
16033 }
16034
16035 // Didn't find a match. Swap the operands and try again.
16036 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16037 ShuffleVectorSDNode::commuteMask(InLaneMask);
16038
16039 if (MatchMasks(InLaneMask, RepeatMask)) {
16040 // Merge this lane mask into the final repeat mask.
16041 MergeMasks(InLaneMask, RepeatMask);
16042 continue;
16043 }
16044
16045 // Couldn't find a match with the operands in either order.
16046 return SDValue();
16047 }
16048
16049 // Now handle any lanes with only one source.
16050 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16051 // If this lane has already been processed, skip it.
16052 if (LaneSrcs[Lane][0] >= 0)
16053 continue;
16054
16055 for (int i = 0; i != NumLaneElts; ++i) {
16056 int M = Mask[(Lane * NumLaneElts) + i];
16057 if (M < 0)
16058 continue;
16059
16060       // If RepeatMask isn't defined yet we can define it ourselves.
16061 if (RepeatMask[i] < 0)
16062 RepeatMask[i] = M % NumLaneElts;
16063
16064 if (RepeatMask[i] < NumElts) {
16065 if (RepeatMask[i] != M % NumLaneElts)
16066 return SDValue();
16067 LaneSrcs[Lane][0] = M / NumLaneElts;
16068 } else {
16069 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16070 return SDValue();
16071 LaneSrcs[Lane][1] = M / NumLaneElts;
16072 }
16073 }
16074
16075 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16076 return SDValue();
16077 }
16078
16079 SmallVector<int, 16> NewMask(NumElts, -1);
16080 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16081 int Src = LaneSrcs[Lane][0];
16082 for (int i = 0; i != NumLaneElts; ++i) {
16083 int M = -1;
16084 if (Src >= 0)
16085 M = Src * NumLaneElts + i;
16086 NewMask[Lane * NumLaneElts + i] = M;
16087 }
16088 }
16089 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16090 // Ensure we didn't get back the shuffle we started with.
16091 // FIXME: This is a hack to make up for some splat handling code in
16092 // getVectorShuffle.
16093 if (isa<ShuffleVectorSDNode>(NewV1) &&
16094 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16095 return SDValue();
16096
16097 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16098 int Src = LaneSrcs[Lane][1];
16099 for (int i = 0; i != NumLaneElts; ++i) {
16100 int M = -1;
16101 if (Src >= 0)
16102 M = Src * NumLaneElts + i;
16103 NewMask[Lane * NumLaneElts + i] = M;
16104 }
16105 }
16106 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16107 // Ensure we didn't get back the shuffle we started with.
16108 // FIXME: This is a hack to make up for some splat handling code in
16109 // getVectorShuffle.
16110 if (isa<ShuffleVectorSDNode>(NewV2) &&
16111 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16112 return SDValue();
16113
16114 for (int i = 0; i != NumElts; ++i) {
16115 NewMask[i] = RepeatMask[i % NumLaneElts];
16116 if (NewMask[i] < 0)
16117 continue;
16118
16119 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16120 }
16121 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16122}
16123
16124/// If the input shuffle mask results in a vector that is undefined in all upper
16125/// or lower half elements and that mask accesses only 2 halves of the
16126/// shuffle's operands, return true. A mask of half the width with mask indexes
16127/// adjusted to access the extracted halves of the original shuffle operands is
16128/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16129/// lower half of each input operand is accessed.
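/// Half indexes are encoded as 0 = lower V1, 1 = upper V1, 2 = lower V2 and
/// 3 = upper V2. For example, the v8 mask <u,u,u,u, 0,1,8,9> produces
/// HalfIdx1 = 0, HalfIdx2 = 2 and HalfMask = <0,1,4,5>.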
16130static bool
16131getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16132 int &HalfIdx1, int &HalfIdx2) {
16133   assert((Mask.size() == HalfMask.size() * 2) &&
16134          "Expected input mask to be twice as long as output");
16135
16136 // Exactly one half of the result must be undef to allow narrowing.
16137 bool UndefLower = isUndefLowerHalf(Mask);
16138 bool UndefUpper = isUndefUpperHalf(Mask);
16139 if (UndefLower == UndefUpper)
16140 return false;
16141
16142 unsigned HalfNumElts = HalfMask.size();
16143 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16144 HalfIdx1 = -1;
16145 HalfIdx2 = -1;
16146 for (unsigned i = 0; i != HalfNumElts; ++i) {
16147 int M = Mask[i + MaskIndexOffset];
16148 if (M < 0) {
16149 HalfMask[i] = M;
16150 continue;
16151 }
16152
16153 // Determine which of the 4 half vectors this element is from.
16154 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16155 int HalfIdx = M / HalfNumElts;
16156
16157 // Determine the element index into its half vector source.
16158 int HalfElt = M % HalfNumElts;
16159
16160 // We can shuffle with up to 2 half vectors, set the new 'half'
16161 // shuffle mask accordingly.
16162 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16163 HalfMask[i] = HalfElt;
16164 HalfIdx1 = HalfIdx;
16165 continue;
16166 }
16167 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16168 HalfMask[i] = HalfElt + HalfNumElts;
16169 HalfIdx2 = HalfIdx;
16170 continue;
16171 }
16172
16173 // Too many half vectors referenced.
16174 return false;
16175 }
16176
16177 return true;
16178}
16179
16180/// Given the output values from getHalfShuffleMask(), create a half width
16181/// shuffle of extracted vectors followed by an insert back to full width.
16182static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16183 ArrayRef<int> HalfMask, int HalfIdx1,
16184 int HalfIdx2, bool UndefLower,
16185 SelectionDAG &DAG, bool UseConcat = false) {
16186   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16187   assert(V1.getValueType().isSimple() && "Expecting only simple types");
16188
16189 MVT VT = V1.getSimpleValueType();
16190 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16191 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16192
16193 auto getHalfVector = [&](int HalfIdx) {
16194 if (HalfIdx < 0)
16195 return DAG.getUNDEF(HalfVT);
16196 SDValue V = (HalfIdx < 2 ? V1 : V2);
16197 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16198 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16199 DAG.getIntPtrConstant(HalfIdx, DL));
16200 };
16201
16202 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16203 SDValue Half1 = getHalfVector(HalfIdx1);
16204 SDValue Half2 = getHalfVector(HalfIdx2);
16205 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16206 if (UseConcat) {
16207 SDValue Op0 = V;
16208 SDValue Op1 = DAG.getUNDEF(HalfVT);
16209 if (UndefLower)
16210 std::swap(Op0, Op1);
16211 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16212 }
16213
16214 unsigned Offset = UndefLower ? HalfNumElts : 0;
16215 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16216 DAG.getIntPtrConstant(Offset, DL));
16217}
16218
16219/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16220/// This allows for fast cases such as subvector extraction/insertion
16221/// or shuffling smaller vector types which can lower more efficiently.
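/// For example, the v8f32 mask <u,u,u,u, 0,1,2,3> is lowered as an extract of
/// the low 128-bit subvector of V1 followed by an insert into the upper half
/// of an undef vector.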
16222static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16223 SDValue V2, ArrayRef<int> Mask,
16224 const X86Subtarget &Subtarget,
16225 SelectionDAG &DAG) {
16226   assert((VT.is256BitVector() || VT.is512BitVector()) &&
16227          "Expected 256-bit or 512-bit vector");
16228
16229 bool UndefLower = isUndefLowerHalf(Mask);
16230 if (!UndefLower && !isUndefUpperHalf(Mask))
16231 return SDValue();
16232
16233   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16234          "Completely undef shuffle mask should have been simplified already");
16235
16236 // Upper half is undef and lower half is whole upper subvector.
16237 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16238 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16239 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16240 if (!UndefLower &&
16241 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16242 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16243 DAG.getIntPtrConstant(HalfNumElts, DL));
16244 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16245 DAG.getIntPtrConstant(0, DL));
16246 }
16247
16248 // Lower half is undef and upper half is whole lower subvector.
16249 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16250 if (UndefLower &&
16251 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16252 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16253 DAG.getIntPtrConstant(0, DL));
16254 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16255 DAG.getIntPtrConstant(HalfNumElts, DL));
16256 }
16257
16258 int HalfIdx1, HalfIdx2;
16259 SmallVector<int, 8> HalfMask(HalfNumElts);
16260 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16261 return SDValue();
16262
16263   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16264
16265 // Only shuffle the halves of the inputs when useful.
16266 unsigned NumLowerHalves =
16267 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16268 unsigned NumUpperHalves =
16269 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16270   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16271
16272 // Determine the larger pattern of undef/halves, then decide if it's worth
16273 // splitting the shuffle based on subtarget capabilities and types.
16274 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16275 if (!UndefLower) {
16276 // XXXXuuuu: no insert is needed.
16277 // Always extract lowers when setting lower - these are all free subreg ops.
16278 if (NumUpperHalves == 0)
16279 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16280 UndefLower, DAG);
16281
16282 if (NumUpperHalves == 1) {
16283 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16284 if (Subtarget.hasAVX2()) {
16285 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16286 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16287 !is128BitUnpackShuffleMask(HalfMask) &&
16288 (!isSingleSHUFPSMask(HalfMask) ||
16289 Subtarget.hasFastVariableShuffle()))
16290 return SDValue();
16291 // If this is a unary shuffle (assume that the 2nd operand is
16292 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16293 // are better off extracting the upper half of 1 operand and using a
16294 // narrow shuffle.
16295 if (EltWidth == 64 && V2.isUndef())
16296 return SDValue();
16297 }
16298 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16299 if (Subtarget.hasAVX512() && VT.is512BitVector())
16300 return SDValue();
16301 // Extract + narrow shuffle is better than the wide alternative.
16302 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16303 UndefLower, DAG);
16304 }
16305
16306 // Don't extract both uppers, instead shuffle and then extract.
16307 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16308 return SDValue();
16309 }
16310
16311 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16312 if (NumUpperHalves == 0) {
16313 // AVX2 has efficient 64-bit element cross-lane shuffles.
16314 // TODO: Refine to account for unary shuffle, splat, and other masks?
16315 if (Subtarget.hasAVX2() && EltWidth == 64)
16316 return SDValue();
16317 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16318 if (Subtarget.hasAVX512() && VT.is512BitVector())
16319 return SDValue();
16320 // Narrow shuffle + insert is better than the wide alternative.
16321 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16322 UndefLower, DAG);
16323 }
16324
16325 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16326 return SDValue();
16327}
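// Editorial worked example (not part of the original source): for a v8f32
// shuffle with Mask = <u, u, u, u, 0, 1, 2, 3> the lower half is undef and the
// upper half is exactly V1's lower subvector, so the early exit above emits
// EXTRACT_SUBVECTOR(V1, 0) inserted at element 4 of an undef v8f32. For
// Mask = <0, 3, u, u, u, u, u, u> only V1's lower half is referenced
// (HalfIdx1 = 0, HalfMask = <0, 3, u, u>, NumUpperHalves = 0), so the
// NumUpperHalves == 0 branch returns a free-subreg narrow shuffle via
// getShuffleHalfVectors.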
16328
16329/// Test whether the specified input (0 or 1) is in-place blended by the
16330/// given mask.
16331///
16332/// This returns true if the elements from a particular input are already in the
16333/// slot required by the given mask and require no permutation.
16334static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16335 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16336 int Size = Mask.size();
16337 for (int i = 0; i < Size; ++i)
16338 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16339 return false;
16340
16341 return true;
16342}
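// Illustrative example (editorial addition): for a two-input v4f64 shuffle
// with Mask = <2, 5, 0, 7>, input 1's elements 5 and 7 already sit in slots 1
// and 3 (5 % 4 == 1, 7 % 4 == 3), so isShuffleMaskInputInPlace(1, Mask) is
// true, while Mask[0] = 2 moves input 0's element 2 into slot 0, so
// isShuffleMaskInputInPlace(0, Mask) is false.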
16343
16344/// Handle case where shuffle sources are coming from the same 128-bit lane and
16345/// every lane can be represented as the same repeating mask - allowing us to
16346/// shuffle the sources with the repeating shuffle and then permute the result
16347/// to the destination lanes.
16348static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16349 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16350 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16351 int NumElts = VT.getVectorNumElements();
16352 int NumLanes = VT.getSizeInBits() / 128;
16353 int NumLaneElts = NumElts / NumLanes;
16354
16355 // On AVX2 we may be able to just shuffle the lowest elements and then
16356 // broadcast the result.
16357 if (Subtarget.hasAVX2()) {
16358 for (unsigned BroadcastSize : {16, 32, 64}) {
16359 if (BroadcastSize <= VT.getScalarSizeInBits())
16360 continue;
16361 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16362
16363 // Attempt to match a repeating pattern every NumBroadcastElts,
16364 // accounting for UNDEFs but only referencing the lowest 128-bit
16365 // lane of the inputs.
16366 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16367 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16368 for (int j = 0; j != NumBroadcastElts; ++j) {
16369 int M = Mask[i + j];
16370 if (M < 0)
16371 continue;
16372 int &R = RepeatMask[j];
16373 if (0 != ((M % NumElts) / NumLaneElts))
16374 return false;
16375 if (0 <= R && R != M)
16376 return false;
16377 R = M;
16378 }
16379 return true;
16380 };
16381
16382 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16383 if (!FindRepeatingBroadcastMask(RepeatMask))
16384 continue;
16385
16386 // Shuffle the (lowest) repeated elements in place for broadcast.
16387 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16388
16389 // Shuffle the actual broadcast.
16390 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16391 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16392 for (int j = 0; j != NumBroadcastElts; ++j)
16393 BroadcastMask[i + j] = j;
16394 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16395 BroadcastMask);
16396 }
16397 }
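// Worked example (added for illustration; not in the original source),
// assuming AVX2 and a v8i32 shuffle with Mask = <2, 3, 2, 3, 2, 3, 2, 3>: the
// 64-bit repeating pattern {2, 3} only references the low 128-bit lane, so the
// loop above shuffles elements 2 and 3 into the low positions and then emits
// the broadcast mask <0, 1, 0, 1, 0, 1, 0, 1>, which can later match a 64-bit
// broadcast.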
16398
16399 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16400 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16401 return SDValue();
16402
16403 // Bail if we already have a repeated lane shuffle mask.
16404 SmallVector<int, 8> RepeatedShuffleMask;
16405 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16406 return SDValue();
16407
16408 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16409 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16410 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16411 int NumSubLanes = NumLanes * SubLaneScale;
16412 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16413
16414 // Check that all the sources are coming from the same lane and see if we can
16415 // form a repeating shuffle mask (local to each sub-lane). At the same time,
16416 // determine the source sub-lane for each destination sub-lane.
16417 int TopSrcSubLane = -1;
16418 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16419 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16420 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16421 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16422
16423 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16424 // Extract the sub-lane mask, check that it all comes from the same lane
16425 // and normalize the mask entries to come from the first lane.
16426 int SrcLane = -1;
16427 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16428 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16429 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16430 if (M < 0)
16431 continue;
16432 int Lane = (M % NumElts) / NumLaneElts;
16433 if ((0 <= SrcLane) && (SrcLane != Lane))
16434 return SDValue();
16435 SrcLane = Lane;
16436 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16437 SubLaneMask[Elt] = LocalM;
16438 }
16439
16440 // Whole sub-lane is UNDEF.
16441 if (SrcLane < 0)
16442 continue;
16443
16444 // Attempt to match against the candidate repeated sub-lane masks.
16445 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16446 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16447 for (int i = 0; i != NumSubLaneElts; ++i) {
16448 if (M1[i] < 0 || M2[i] < 0)
16449 continue;
16450 if (M1[i] != M2[i])
16451 return false;
16452 }
16453 return true;
16454 };
16455
16456 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16457 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16458 continue;
16459
16460 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16461 for (int i = 0; i != NumSubLaneElts; ++i) {
16462 int M = SubLaneMask[i];
16463 if (M < 0)
16464 continue;
16465 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16466 "Unexpected mask element");
16467 RepeatedSubLaneMask[i] = M;
16468 }
16469
16470 // Track the top most source sub-lane - by setting the remaining to UNDEF
16471 // we can greatly simplify shuffle matching.
16472 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16473 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16474 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16475 break;
16476 }
16477
16478 // Bail if we failed to find a matching repeated sub-lane mask.
16479 if (Dst2SrcSubLanes[DstSubLane] < 0)
16480 return SDValue();
16481 }
16482 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16483 "Unexpected source lane");
16484
16485 // Create a repeating shuffle mask for the entire vector.
16486 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16487 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16488 int Lane = SubLane / SubLaneScale;
16489 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16490 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16491 int M = RepeatedSubLaneMask[Elt];
16492 if (M < 0)
16493 continue;
16494 int Idx = (SubLane * NumSubLaneElts) + Elt;
16495 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16496 }
16497 }
16498 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16499
16500 // Shuffle each source sub-lane to its destination.
16501 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16502 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16503 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16504 if (SrcSubLane < 0)
16505 continue;
16506 for (int j = 0; j != NumSubLaneElts; ++j)
16507 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16508 }
16509
16510 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16511 SubLaneMask);
16512}
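// Worked example (editorial, not in the original source), assuming AVX2 and a
// unary v8f32 shuffle with Mask = <6, 7, 2, 3, 6, 7, 2, 3>: every 64-bit
// destination sub-lane reads local elements {2, 3} of some source lane, so the
// routine above first builds the in-lane repeated shuffle
// <2, 3, u, u, 6, 7, u, u> and then the sub-lane permute
// <4, 5, 0, 1, 4, 5, 0, 1>, which later matching can select as a 64-bit lane
// permute rather than a variable cross-lane shuffle.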
16513
16514static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16515 bool &ForceV1Zero, bool &ForceV2Zero,
16516 unsigned &ShuffleImm, ArrayRef<int> Mask,
16517 const APInt &Zeroable) {
16518 int NumElts = VT.getVectorNumElements();
16519 assert(VT.getScalarSizeInBits() == 64 &&
16520 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16521 "Unexpected data type for VSHUFPD");
16522 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16523 "Illegal shuffle mask");
16524
16525 bool ZeroLane[2] = { true, true };
16526 for (int i = 0; i < NumElts; ++i)
16527 ZeroLane[i & 1] &= Zeroable[i];
16528
16529 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16530 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16531 ShuffleImm = 0;
16532 bool ShufpdMask = true;
16533 bool CommutableMask = true;
16534 for (int i = 0; i < NumElts; ++i) {
16535 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16536 continue;
16537 if (Mask[i] < 0)
16538 return false;
16539 int Val = (i & 6) + NumElts * (i & 1);
16540 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16541 if (Mask[i] < Val || Mask[i] > Val + 1)
16542 ShufpdMask = false;
16543 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16544 CommutableMask = false;
16545 ShuffleImm |= (Mask[i] % 2) << i;
16546 }
16547
16548 if (!ShufpdMask && !CommutableMask)
16549 return false;
16550
16551 if (!ShufpdMask && CommutableMask)
16552 std::swap(V1, V2);
16553
16554 ForceV1Zero = ZeroLane[0];
16555 ForceV2Zero = ZeroLane[1];
16556 return true;
16557}
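// Illustrative example (editorial addition): for v4f64 with nothing zeroable
// and Mask = <0, 5, 2, 7>, every even slot reads V1 and every odd slot reads
// V2 from the matching 128-bit lane, so ShufpdMask stays true and ShuffleImm
// accumulates (Mask[i] % 2) << i = 0b1010, i.e. SHUFPD V1, V2 with immediate
// 0xA.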
16558
16559static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16560 SDValue V2, ArrayRef<int> Mask,
16561 const APInt &Zeroable,
16562 const X86Subtarget &Subtarget,
16563 SelectionDAG &DAG) {
16564 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16565 "Unexpected data type for VSHUFPD");
16566
16567 unsigned Immediate = 0;
16568 bool ForceV1Zero = false, ForceV2Zero = false;
16569 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16570 Mask, Zeroable))
16571 return SDValue();
16572
16573 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16574 if (ForceV1Zero)
16575 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16576 if (ForceV2Zero)
16577 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16578
16579 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16580 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16581}
16582
16583 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16584// by zeroable elements in the remaining 24 elements. Turn this into two
16585// vmovqb instructions shuffled together.
16586static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16587 SDValue V1, SDValue V2,
16588 ArrayRef<int> Mask,
16589 const APInt &Zeroable,
16590 SelectionDAG &DAG) {
16591 assert(VT == MVT::v32i8 && "Unexpected type!");
16592
16593 // The first 8 indices should be every 8th element.
16594 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16595 return SDValue();
16596
16597 // Remaining elements need to be zeroable.
16598 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16599 return SDValue();
16600
16601 V1 = DAG.getBitcast(MVT::v4i64, V1);
16602 V2 = DAG.getBitcast(MVT::v4i64, V2);
16603
16604 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16605 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16606
16607 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16608 // the upper bits of the result using an unpckldq.
16609 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16610 { 0, 1, 2, 3, 16, 17, 18, 19,
16611 4, 5, 6, 7, 20, 21, 22, 23 });
16612 // Insert the unpckldq into a zero vector to widen to v32i8.
16613 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16614 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16615 DAG.getIntPtrConstant(0, DL));
16616}
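// Editorial sketch of the transform above (not part of the original source):
// a v32i8 mask beginning <0, 8, 16, 24, 32, 40, 48, 56> with the remaining 24
// elements zeroable requests byte 0 of every qword of V1 and V2. Each VTRUNC
// leaves those four bytes in bytes 0..3 of a v16i8 with zeros above, and the
// unpack mask {0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23} interleaves the two
// dword groups exactly like vpunpckldq, giving V1's bytes, then V2's bytes,
// then zeros once widened back to v32i8.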
16617
16618
16619/// Handle lowering of 4-lane 64-bit floating point shuffles.
16620///
16621/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16622/// isn't available.
16623static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16624 const APInt &Zeroable, SDValue V1, SDValue V2,
16625 const X86Subtarget &Subtarget,
16626 SelectionDAG &DAG) {
16627 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16628 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16629 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16630
16631 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16632 Subtarget, DAG))
16633 return V;
16634
16635 if (V2.isUndef()) {
16636 // Check for being able to broadcast a single element.
16637 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16638 Mask, Subtarget, DAG))
16639 return Broadcast;
16640
16641 // Use low duplicate instructions for masks that match their pattern.
16642 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16643 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16644
16645 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16646 // Non-half-crossing single input shuffles can be lowered with an
16647 // interleaved permutation.
16648 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16649 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16650 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16651 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16652 }
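// Illustrative example (editorial addition): for Mask = <1, 0, 3, 2> the bit
// tests above produce VPERMILPMask = 0b0101 (swap the two elements within
// each 128-bit lane), which corresponds to VPERMILPD with immediate 0x5.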
16653
16654 // With AVX2 we have direct support for this permutation.
16655 if (Subtarget.hasAVX2())
16656 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16657 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16658
16659 // Try to create an in-lane repeating shuffle mask and then shuffle the
16660 // results into the target lanes.
16661 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16662 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16663 return V;
16664
16665 // Try to permute the lanes and then use a per-lane permute.
16666 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16667 Mask, DAG, Subtarget))
16668 return V;
16669
16670 // Otherwise, fall back.
16671 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16672 DAG, Subtarget);
16673 }
16674
16675 // Use dedicated unpack instructions for masks that match their pattern.
16676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16677 return V;
16678
16679 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16680 Zeroable, Subtarget, DAG))
16681 return Blend;
16682
16683 // Check if the blend happens to exactly fit that of SHUFPD.
16684 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16685 Zeroable, Subtarget, DAG))
16686 return Op;
16687
16688 // If we have lane crossing shuffles AND they don't all come from the lower
16689 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16690 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16691 // canonicalizes to a blend of splat, which isn't necessary for this combine.
16692 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16693 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16694 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16695 (V2.getOpcode() != ISD::BUILD_VECTOR))
16696 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16697 Mask, DAG))
16698 return Op;
16699
16700 // If we have one input in place, then we can permute the other input and
16701 // blend the result.
16702 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16703 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16704 Subtarget, DAG);
16705
16706 // Try to create an in-lane repeating shuffle mask and then shuffle the
16707 // results into the target lanes.
16708 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16709 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16710 return V;
16711
16712 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16713 // shuffle. However, if we have AVX2 and either input is already in place,
16714 // we will be able to shuffle the other input even across lanes in a single
16715 // instruction, so skip this pattern.
16716 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16717 isShuffleMaskInputInPlace(1, Mask))))
16718 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16719 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16720 return V;
16721
16722 // If we have VLX support, we can use VEXPAND.
16723 if (Subtarget.hasVLX())
16724 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16725 DAG, Subtarget))
16726 return V;
16727
16728 // If we have AVX2 then we always want to lower with a blend because at v4 we
16729 // can fully permute the elements.
16730 if (Subtarget.hasAVX2())
16731 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16732 Subtarget, DAG);
16733
16734 // Otherwise fall back on generic lowering.
16735 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16736 Subtarget, DAG);
16737}
16738
16739/// Handle lowering of 4-lane 64-bit integer shuffles.
16740///
16741/// This routine is only called when we have AVX2 and thus a reasonable
16742 /// instruction set for v4i64 shuffling.
16743static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16744 const APInt &Zeroable, SDValue V1, SDValue V2,
16745 const X86Subtarget &Subtarget,
16746 SelectionDAG &DAG) {
16747 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16748 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16749 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16750 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16751
16752 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16753 Subtarget, DAG))
16754 return V;
16755
16756 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16757 Zeroable, Subtarget, DAG))
16758 return Blend;
16759
16760 // Check for being able to broadcast a single element.
16761 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16762 Subtarget, DAG))
16763 return Broadcast;
16764
16765 if (V2.isUndef()) {
16766 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16767 // can use lower latency instructions that will operate on both lanes.
16768 SmallVector<int, 2> RepeatedMask;
16769 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16770 SmallVector<int, 4> PSHUFDMask;
16771 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16772 return DAG.getBitcast(
16773 MVT::v4i64,
16774 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16775 DAG.getBitcast(MVT::v8i32, V1),
16776 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16777 }
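// Worked example (editorial, not in the original source): a unary v4i64
// shuffle with Mask = <1, 0, 3, 2> has the 128-bit-lane repeated mask <1, 0>,
// which narrowShuffleMaskElts(2, ...) expands to the v8i32 mask <2, 3, 0, 1>;
// getV4X86ShuffleImm8ForMask encodes that as 0x4E, i.e. a PSHUFD that swaps
// the two qwords inside each lane.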
16778
16779 // AVX2 provides a direct instruction for permuting a single input across
16780 // lanes.
16781 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16782 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16783 }
16784
16785 // Try to use shift instructions.
16786 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
16787 Zeroable, Subtarget, DAG))
16788 return Shift;
16789
16790 // If we have VLX support, we can use VALIGN or VEXPAND.
16791 if (Subtarget.hasVLX()) {
16792 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16793 Subtarget, DAG))
16794 return Rotate;
16795
16796 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16797 DAG, Subtarget))
16798 return V;
16799 }
16800
16801 // Try to use PALIGNR.
16802 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16803 Subtarget, DAG))
16804 return Rotate;
16805
16806 // Use dedicated unpack instructions for masks that match their pattern.
16807 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16808 return V;
16809
16810 // If we have one input in place, then we can permute the other input and
16811 // blend the result.
16812 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16813 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16814 Subtarget, DAG);
16815
16816 // Try to create an in-lane repeating shuffle mask and then shuffle the
16817 // results into the target lanes.
16818 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16819 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16820 return V;
16821
16822 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16823 // shuffle. However, if we have AVX2 and either input is already in place,
16824 // we will be able to shuffle the other input even across lanes in a single
16825 // instruction, so skip this pattern.
16826 if (!isShuffleMaskInputInPlace(0, Mask) &&
16827 !isShuffleMaskInputInPlace(1, Mask))
16828 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16829 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16830 return Result;
16831
16832 // Otherwise fall back on generic blend lowering.
16833 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16834 Subtarget, DAG);
16835}
16836
16837/// Handle lowering of 8-lane 32-bit floating point shuffles.
16838///
16839/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16840/// isn't available.
16841static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16842 const APInt &Zeroable, SDValue V1, SDValue V2,
16843 const X86Subtarget &Subtarget,
16844 SelectionDAG &DAG) {
16845 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16846 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16847 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16848
16849 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16850 Zeroable, Subtarget, DAG))
16851 return Blend;
16852
16853 // Check for being able to broadcast a single element.
16854 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16855 Subtarget, DAG))
16856 return Broadcast;
16857
16858 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16859 // options to efficiently lower the shuffle.
16860 SmallVector<int, 4> RepeatedMask;
16861 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16862 assert(RepeatedMask.size() == 4 &&
16863 "Repeated masks must be half the mask width!");
16864
16865 // Use even/odd duplicate instructions for masks that match their pattern.
16866 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16867 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16868 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16869 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16870
16871 if (V2.isUndef())
16872 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16873 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16874
16875 // Use dedicated unpack instructions for masks that match their pattern.
16876 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16877 return V;
16878
16879 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16880 // have already handled any direct blends.
16881 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16882 }
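// Illustrative example (editorial addition): a unary v8f32 shuffle with
// Mask = <0, 0, 2, 2, 4, 4, 6, 6> repeats <0, 0, 2, 2> in both 128-bit lanes,
// so the block above selects MOVSLDUP; <1, 1, 3, 3, 5, 5, 7, 7> would select
// MOVSHDUP instead.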
16883
16884 // Try to create an in-lane repeating shuffle mask and then shuffle the
16885 // results into the target lanes.
16886 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16887 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16888 return V;
16889
16890 // If we have a single input shuffle with different shuffle patterns in the
16891 // two 128-bit lanes, use the variable-mask form of VPERMILPS.
16892 if (V2.isUndef()) {
16893 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16894 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16895 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16896 }
16897 if (Subtarget.hasAVX2()) {
16898 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16899 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16900 }
16901 // Otherwise, fall back.
16902 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16903 DAG, Subtarget);
16904 }
16905
16906 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16907 // shuffle.
16908 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16909 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16910 return Result;
16911
16912 // If we have VLX support, we can use VEXPAND.
16913 if (Subtarget.hasVLX())
16914 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16915 DAG, Subtarget))
16916 return V;
16917
16918 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16919 // split, since after the split we get more efficient code using vpunpcklwd
16920 // and vpunpckhwd instructions than with vblend.
16921 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
16922 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16923 DAG);
16924
16925 // If we have AVX2 then we always want to lower with a blend because at v8 we
16926 // can fully permute the elements.
16927 if (Subtarget.hasAVX2())
16928 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16929 Subtarget, DAG);
16930
16931 // Otherwise fall back on generic lowering.
16932 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16933 Subtarget, DAG);
16934}
16935
16936/// Handle lowering of 8-lane 32-bit integer shuffles.
16937///
16938/// This routine is only called when we have AVX2 and thus a reasonable
16939 /// instruction set for v8i32 shuffling.
16940static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16941 const APInt &Zeroable, SDValue V1, SDValue V2,
16942 const X86Subtarget &Subtarget,
16943 SelectionDAG &DAG) {
16944 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16945 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16946 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16947 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16948
16949 // Whenever we can lower this as a zext, that instruction is strictly faster
16950 // than any alternative. It also allows us to fold memory operands into the
16951 // shuffle in many cases.
16952 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16953 Zeroable, Subtarget, DAG))
16954 return ZExt;
16955
16956 // For non-AVX512, if the mask is of 16-bit elements in each lane then try to
16957 // split, since after the split we get more efficient code than vblend by
16958 // using vpunpcklwd and vpunpckhwd instructions.
16959 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
16960 !Subtarget.hasAVX512())
16961 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16962 DAG);
16963
16964 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16965 Zeroable, Subtarget, DAG))
16966 return Blend;
16967
16968 // Check for being able to broadcast a single element.
16969 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16970 Subtarget, DAG))
16971 return Broadcast;
16972
16973 // If the shuffle mask is repeated in each 128-bit lane we can use more
16974 // efficient instructions that mirror the shuffles across the two 128-bit
16975 // lanes.
16976 SmallVector<int, 4> RepeatedMask;
16977 bool Is128BitLaneRepeatedShuffle =
16978 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16979 if (Is128BitLaneRepeatedShuffle) {
16980 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16981 if (V2.isUndef())
16982 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16983 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16984
16985 // Use dedicated unpack instructions for masks that match their pattern.
16986 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16987 return V;
16988 }
16989
16990 // Try to use shift instructions.
16991 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
16992 Zeroable, Subtarget, DAG))
16993 return Shift;
16994
16995 // If we have VLX support, we can use VALIGN or EXPAND.
16996 if (Subtarget.hasVLX()) {
16997 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16998 Subtarget, DAG))
16999 return Rotate;
17000
17001 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17002 DAG, Subtarget))
17003 return V;
17004 }
17005
17006 // Try to use byte rotation instructions.
17007 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17008 Subtarget, DAG))
17009 return Rotate;
17010
17011 // Try to create an in-lane repeating shuffle mask and then shuffle the
17012 // results into the target lanes.
17013 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17014 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17015 return V;
17016
17017 if (V2.isUndef()) {
17018 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17019 // because that should be faster than the variable permute alternatives.
17020 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17021 return V;
17022
17023 // If the shuffle patterns aren't repeated but it's a single input, directly
17024 // generate a cross-lane VPERMD instruction.
17025 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17026 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17027 }
17028
17029 // Assume that a single SHUFPS is faster than an alternative sequence of
17030 // multiple instructions (even if the CPU has a domain penalty).
17031 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17032 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17033 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17034 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17035 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17036 CastV1, CastV2, DAG);
17037 return DAG.getBitcast(MVT::v8i32, ShufPS);
17038 }
17039
17040 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17041 // shuffle.
17042 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17043 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17044 return Result;
17045
17046 // Otherwise fall back on generic blend lowering.
17047 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17048 Subtarget, DAG);
17049}
17050
17051/// Handle lowering of 16-lane 16-bit integer shuffles.
17052///
17053/// This routine is only called when we have AVX2 and thus a reasonable
17054 /// instruction set for v16i16 shuffling.
17055static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17056 const APInt &Zeroable, SDValue V1, SDValue V2,
17057 const X86Subtarget &Subtarget,
17058 SelectionDAG &DAG) {
17059 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17060 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17061 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17062 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17063
17064 // Whenever we can lower this as a zext, that instruction is strictly faster
17065 // than any alternative. It also allows us to fold memory operands into the
17066 // shuffle in many cases.
17067 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17068 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17069 return ZExt;
17070
17071 // Check for being able to broadcast a single element.
17072 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17073 Subtarget, DAG))
17074 return Broadcast;
17075
17076 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17077 Zeroable, Subtarget, DAG))
17078 return Blend;
17079
17080 // Use dedicated unpack instructions for masks that match their pattern.
17081 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17082 return V;
17083
17084 // Use dedicated pack instructions for masks that match their pattern.
17085 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17086 Subtarget))
17087 return V;
17088
17089 // Try to lower using a truncation.
17090 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17091 Subtarget, DAG))
17092 return V;
17093
17094 // Try to use shift instructions.
17095 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17096 Zeroable, Subtarget, DAG))
17097 return Shift;
17098
17099 // Try to use byte rotation instructions.
17100 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17101 Subtarget, DAG))
17102 return Rotate;
17103
17104 // Try to create an in-lane repeating shuffle mask and then shuffle the
17105 // results into the target lanes.
17106 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17107 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17108 return V;
17109
17110 if (V2.isUndef()) {
17111 // Try to use bit rotation instructions.
17112 if (SDValue Rotate =
17113 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17114 return Rotate;
17115
17116 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17117 // because that should be faster than the variable permute alternatives.
17118 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17119 return V;
17120
17121 // There are no generalized cross-lane shuffle operations available on i16
17122 // element types.
17123 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17124 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17125 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17126 return V;
17127
17128 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17129 DAG, Subtarget);
17130 }
17131
17132 SmallVector<int, 8> RepeatedMask;
17133 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17134 // As this is a single-input shuffle, the repeated mask should be
17135 // a strictly valid v8i16 mask that we can pass through to the v8i16
17136 // lowering to handle even the v16 case.
17137 return lowerV8I16GeneralSingleInputShuffle(
17138 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17139 }
17140 }
17141
17142 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17143 Zeroable, Subtarget, DAG))
17144 return PSHUFB;
17145
17146 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17147 if (Subtarget.hasBWI())
17148 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17149
17150 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17151 // shuffle.
17152 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17153 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17154 return Result;
17155
17156 // Try to permute the lanes and then use a per-lane permute.
17157 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17158 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17159 return V;
17160
17161 // Otherwise fall back on generic lowering.
17162 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17163 Subtarget, DAG);
17164}
17165
17166/// Handle lowering of 32-lane 8-bit integer shuffles.
17167///
17168/// This routine is only called when we have AVX2 and thus a reasonable
17169 /// instruction set for v32i8 shuffling.
17170static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17171 const APInt &Zeroable, SDValue V1, SDValue V2,
17172 const X86Subtarget &Subtarget,
17173 SelectionDAG &DAG) {
17174 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17175 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17176 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17177 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17178
17179 // Whenever we can lower this as a zext, that instruction is strictly faster
17180 // than any alternative. It also allows us to fold memory operands into the
17181 // shuffle in many cases.
17182 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17183 Zeroable, Subtarget, DAG))
17184 return ZExt;
17185
17186 // Check for being able to broadcast a single element.
17187 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17188 Subtarget, DAG))
17189 return Broadcast;
17190
17191 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17192 Zeroable, Subtarget, DAG))
17193 return Blend;
17194
17195 // Use dedicated unpack instructions for masks that match their pattern.
17196 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17197 return V;
17198
17199 // Use dedicated pack instructions for masks that match their pattern.
17200 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17201 Subtarget))
17202 return V;
17203
17204 // Try to lower using a truncation.
17205 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17206 Subtarget, DAG))
17207 return V;
17208
17209 // Try to use shift instructions.
17210 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17211 Zeroable, Subtarget, DAG))
17212 return Shift;
17213
17214 // Try to use byte rotation instructions.
17215 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17216 Subtarget, DAG))
17217 return Rotate;
17218
17219 // Try to use bit rotation instructions.
17220 if (V2.isUndef())
17221 if (SDValue Rotate =
17222 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17223 return Rotate;
17224
17225 // Try to create an in-lane repeating shuffle mask and then shuffle the
17226 // results into the target lanes.
17227 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17228 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17229 return V;
17230
17231 // There are no generalized cross-lane shuffle operations available on i8
17232 // element types.
17233 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17234 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17235 // because that should be faster than the variable permute alternatives.
17236 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17237 return V;
17238
17239 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17240 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17241 return V;
17242
17243 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17244 DAG, Subtarget);
17245 }
17246
17247 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17248 Zeroable, Subtarget, DAG))
17249 return PSHUFB;
17250
17251 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17252 if (Subtarget.hasVBMI())
17253 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17254
17255 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17256 // shuffle.
17257 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17258 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17259 return Result;
17260
17261 // Try to permute the lanes and then use a per-lane permute.
17262 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17263 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17264 return V;
17265
17266 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17267 // by zeroable elements in the remaining 24 elements. Turn this into two
17268 // vmovqb instructions shuffled together.
17269 if (Subtarget.hasVLX())
17270 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17271 Mask, Zeroable, DAG))
17272 return V;
17273
17274 // Otherwise fall back on generic lowering.
17275 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17276 Subtarget, DAG);
17277}
17278
17279/// High-level routine to lower various 256-bit x86 vector shuffles.
17280///
17281/// This routine either breaks down the specific type of a 256-bit x86 vector
17282/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17283/// together based on the available instructions.
17284static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17285 SDValue V1, SDValue V2, const APInt &Zeroable,
17286 const X86Subtarget &Subtarget,
17287 SelectionDAG &DAG) {
17288 // If we have a single input to the zero element, insert that into V1 if we
17289 // can do so cheaply.
17290 int NumElts = VT.getVectorNumElements();
17291 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17292
17293 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17294 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17295 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17296 return Insertion;
17297
17298 // Handle special cases where the lower or upper half is UNDEF.
17299 if (SDValue V =
17300 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17301 return V;
17302
17303 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17304 // can check for those subtargets here and avoid much of the subtarget
17305 // querying in the per-vector-type lowering routines. With AVX1 we have
17306 // essentially *zero* ability to manipulate a 256-bit vector with integer
17307 // types. Since we'll use floating point types there eventually, just
17308 // immediately cast everything to a float and operate entirely in that domain.
17309 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17310 int ElementBits = VT.getScalarSizeInBits();
17311 if (ElementBits < 32) {
17312 // No floating point type available, if we can't use the bit operations
17313 // for masking/blending then decompose into 128-bit vectors.
17314 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17315 Subtarget, DAG))
17316 return V;
17317 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17318 return V;
17319 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17320 }
17321
17322 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17323 VT.getVectorNumElements());
17324 V1 = DAG.getBitcast(FpVT, V1);
17325 V2 = DAG.getBitcast(FpVT, V2);
17326 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17327 }
17328
17329 switch (VT.SimpleTy) {
17330 case MVT::v4f64:
17331 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17332 case MVT::v4i64:
17333 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17334 case MVT::v8f32:
17335 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17336 case MVT::v8i32:
17337 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17338 case MVT::v16i16:
17339 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17340 case MVT::v32i8:
17341 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17342
17343 default:
17344 llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17344)
;
17345 }
17346}
17347
17348 /// Try to lower a vector shuffle as 128-bit shuffles.
17349static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17350 const APInt &Zeroable, SDValue V1, SDValue V2,
17351 const X86Subtarget &Subtarget,
17352 SelectionDAG &DAG) {
17353 assert(VT.getScalarSizeInBits() == 64 &&((VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17354, __PRETTY_FUNCTION__))
17354 "Unexpected element type size for 128bit shuffle.")((VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."
) ? static_cast<void> (0) : __assert_fail ("VT.getScalarSizeInBits() == 64 && \"Unexpected element type size for 128bit shuffle.\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17354, __PRETTY_FUNCTION__))
;
17355
17356 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
17357 // is most probably the better solution for that case.
17358 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17359
17360 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17361 SmallVector<int, 4> Widened128Mask;
17362 if (!canWidenShuffleElements(Mask, Widened128Mask))
17363 return SDValue();
17364 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch")((Widened128Mask.size() == 4 && "Shuffle widening mismatch"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask.size() == 4 && \"Shuffle widening mismatch\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17364, __PRETTY_FUNCTION__))
;
17365
17366 // Try to use an insert into a zero vector.
17367 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17368 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17369 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17370 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17371 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17372 DAG.getIntPtrConstant(0, DL));
17373 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17374 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17375 DAG.getIntPtrConstant(0, DL));
17376 }
17377
17378 // Check for patterns which can be matched with a single insert of a 256-bit
17379 // subvector.
17380 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17381 if (OnlyUsesV1 ||
17382 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17383 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17384 SDValue SubVec =
17385 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17386 DAG.getIntPtrConstant(0, DL));
17387 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17388 DAG.getIntPtrConstant(4, DL));
17389 }
17390
17391 // See if this is an insertion of the lower 128-bits of V2 into V1.
17392 bool IsInsert = true;
17393 int V2Index = -1;
17394 for (int i = 0; i < 4; ++i) {
17395 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask[i] >= -1 && \"Illegal shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17395, __PRETTY_FUNCTION__))
;
17396 if (Widened128Mask[i] < 0)
17397 continue;
17398
17399 // Make sure all V1 subvectors are in place.
17400 if (Widened128Mask[i] < 4) {
17401 if (Widened128Mask[i] != i) {
17402 IsInsert = false;
17403 break;
17404 }
17405 } else {
17406 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17407 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17408 IsInsert = false;
17409 break;
17410 }
17411 V2Index = i;
17412 }
17413 }
17414 if (IsInsert && V2Index >= 0) {
17415 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17416 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17417 DAG.getIntPtrConstant(0, DL));
17418 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17419 }
17420
17421 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17422 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17423 // widening where possible we at least ensure the lanes stay sequential to
17424 // help later combines.
17425 SmallVector<int, 2> Widened256Mask;
17426 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17427 Widened128Mask.clear();
17428 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17429 }
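// Worked example (illustrative annotation, not part of the analyzed source):
// a Widened128Mask of {2, -1, -1, -1} widens to the 256-bit mask {1, -1} and
// is narrowed back to {2, 3, -1, -1}, so the previously-undef lane now stays
// sequential with its neighbour before the SHUF128 immediate is built below.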
17430
17431 // Try to lower to vshuf64x2/vshuf32x4.
17432 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17433 unsigned PermMask = 0;
17434 // Ensure elements came from the same Op.
17435 for (int i = 0; i < 4; ++i) {
17436 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"
) ? static_cast<void> (0) : __assert_fail ("Widened128Mask[i] >= -1 && \"Illegal shuffle sentinel value\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17436, __PRETTY_FUNCTION__))
;
17437 if (Widened128Mask[i] < 0)
17438 continue;
17439
17440 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17441 unsigned OpIndex = i / 2;
17442 if (Ops[OpIndex].isUndef())
17443 Ops[OpIndex] = Op;
17444 else if (Ops[OpIndex] != Op)
17445 return SDValue();
17446
17447 // Convert the 128-bit shuffle mask selection values into 128-bit selection
17448 // bits defined by a vshuf64x2 instruction's immediate control byte.
17449 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17450 }
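// Worked example (illustrative annotation, not part of the analyzed source):
// for Widened128Mask = {0, 1, 4, 5} the loop above selects Ops = {V1, V2} and
// builds PermMask = 0 | (1 << 2) | (0 << 4) | (1 << 6) = 0x44, i.e. each half
// of the result keeps its source's 128-bit lanes in their original order.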
17451
17452 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17453 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17454}
17455
17456/// Handle lowering of 8-lane 64-bit floating point shuffles.
17457static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17458 const APInt &Zeroable, SDValue V1, SDValue V2,
17459 const X86Subtarget &Subtarget,
17460 SelectionDAG &DAG) {
17461 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17462 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17463 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17464
17465 if (V2.isUndef()) {
17466 // Use low duplicate instructions for masks that match their pattern.
17467 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17468 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17469
17470 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17471 // Non-half-crossing single input shuffles can be lowered with an
17472 // interleaved permutation.
17473 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17474 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17475 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17476 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17477 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17478 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17479 }
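// Worked example (illustrative annotation, not part of the analyzed source):
// for the unary mask {1, 0, 3, 2, 5, 4, 7, 6} the comparisons above set bits
// 0, 2, 4 and 6, giving VPERMILPMask = 0x55, which swaps the two doubles
// within every 128-bit lane.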
17480
17481 SmallVector<int, 4> RepeatedMask;
17482 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17483 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17484 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17485 }
17486
17487 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17488 V2, Subtarget, DAG))
17489 return Shuf128;
17490
17491 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17492 return Unpck;
17493
17494 // Check if the blend happens to exactly fit that of SHUFPD.
17495 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17496 Zeroable, Subtarget, DAG))
17497 return Op;
17498
17499 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17500 DAG, Subtarget))
17501 return V;
17502
17503 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17504 Zeroable, Subtarget, DAG))
17505 return Blend;
17506
17507 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17508}
17509
17510/// Handle lowering of 16-lane 32-bit floating point shuffles.
17511static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17512 const APInt &Zeroable, SDValue V1, SDValue V2,
17513 const X86Subtarget &Subtarget,
17514 SelectionDAG &DAG) {
17515 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17516 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17517 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17518
17519 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17520 // options to efficiently lower the shuffle.
17521 SmallVector<int, 4> RepeatedMask;
17522 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17523 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((RepeatedMask.size() == 4 && "Unexpected repeated mask size!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17523, __PRETTY_FUNCTION__))
;
17524
17525 // Use even/odd duplicate instructions for masks that match their pattern.
17526 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17527 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17528 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17529 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17530
17531 if (V2.isUndef())
17532 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17533 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17534
17535 // Use dedicated unpack instructions for masks that match their pattern.
17536 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17537 return V;
17538
17539 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17540 Zeroable, Subtarget, DAG))
17541 return Blend;
17542
17543 // Otherwise, fall back to a SHUFPS sequence.
17544 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17545 }
17546
17547 // Try to create an in-lane repeating shuffle mask and then shuffle the
17548 // results into the target lanes.
17549 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17550 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17551 return V;
17552
17553 // If we have a single input shuffle with different shuffle patterns in the
17554 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17555 if (V2.isUndef() &&
17556 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17557 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17558 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17559 }
17560
17561 // If we have AVX512F support, we can use VEXPAND.
17562 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17563 V1, V2, DAG, Subtarget))
17564 return V;
17565
17566 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17567}
17568
17569/// Handle lowering of 8-lane 64-bit integer shuffles.
17570static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17571 const APInt &Zeroable, SDValue V1, SDValue V2,
17572 const X86Subtarget &Subtarget,
17573 SelectionDAG &DAG) {
17574 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17575 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17576 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17577
17578 if (V2.isUndef()) {
17579 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17580 // can use lower latency instructions that will operate on all four
17581 // 128-bit lanes.
17582 SmallVector<int, 2> Repeated128Mask;
17583 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17584 SmallVector<int, 4> PSHUFDMask;
17585 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17586 return DAG.getBitcast(
17587 MVT::v8i64,
17588 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17589 DAG.getBitcast(MVT::v16i32, V1),
17590 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17591 }
17592
17593 SmallVector<int, 4> Repeated256Mask;
17594 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17595 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17596 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17597 }
17598
17599 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17600 V2, Subtarget, DAG))
17601 return Shuf128;
17602
17603 // Try to use shift instructions.
17604 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17605 Zeroable, Subtarget, DAG))
17606 return Shift;
17607
17608 // Try to use VALIGN.
17609 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17610 Subtarget, DAG))
17611 return Rotate;
17612
17613 // Try to use PALIGNR.
17614 if (Subtarget.hasBWI())
17615 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17616 Subtarget, DAG))
17617 return Rotate;
17618
17619 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17620 return Unpck;
17621
17622 // If we have AVX512F support, we can use VEXPAND.
17623 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17624 DAG, Subtarget))
17625 return V;
17626
17627 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17628 Zeroable, Subtarget, DAG))
17629 return Blend;
17630
17631 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17632}
17633
17634/// Handle lowering of 16-lane 32-bit integer shuffles.
17635static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17636 const APInt &Zeroable, SDValue V1, SDValue V2,
17637 const X86Subtarget &Subtarget,
17638 SelectionDAG &DAG) {
17639 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17640 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17641 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17642
17643 // Whenever we can lower this as a zext, that instruction is strictly faster
17644 // than any alternative. It also allows us to fold memory operands into the
17645 // shuffle in many cases.
17646 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17647 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17648 return ZExt;
17649
17650 // If the shuffle mask is repeated in each 128-bit lane we can use more
17651 // efficient instructions that mirror the shuffles across the four 128-bit
17652 // lanes.
17653 SmallVector<int, 4> RepeatedMask;
17654 bool Is128BitLaneRepeatedShuffle =
17655 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17656 if (Is128BitLaneRepeatedShuffle) {
17657 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((RepeatedMask.size() == 4 && "Unexpected repeated mask size!"
) ? static_cast<void> (0) : __assert_fail ("RepeatedMask.size() == 4 && \"Unexpected repeated mask size!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17657, __PRETTY_FUNCTION__))
;
17658 if (V2.isUndef())
17659 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17660 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17661
17662 // Use dedicated unpack instructions for masks that match their pattern.
17663 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17664 return V;
17665 }
17666
17667 // Try to use shift instructions.
17668 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17669 Zeroable, Subtarget, DAG))
17670 return Shift;
17671
17672 // Try to use VALIGN.
17673 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17674 Subtarget, DAG))
17675 return Rotate;
17676
17677 // Try to use byte rotation instructions.
17678 if (Subtarget.hasBWI())
17679 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17680 Subtarget, DAG))
17681 return Rotate;
17682
17683 // Assume that a single SHUFPS is faster than using a permv shuffle.
17684 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17685 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17686 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17687 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17688 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17689 CastV1, CastV2, DAG);
17690 return DAG.getBitcast(MVT::v16i32, ShufPS);
17691 }
17692
17693 // Try to create an in-lane repeating shuffle mask and then shuffle the
17694 // results into the target lanes.
17695 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17696 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17697 return V;
17698
17699 // If we have AVX512F support, we can use VEXPAND.
17700 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17701 DAG, Subtarget))
17702 return V;
17703
17704 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17705 Zeroable, Subtarget, DAG))
17706 return Blend;
17707
17708 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17709}
17710
17711/// Handle lowering of 32-lane 16-bit integer shuffles.
17712static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17713 const APInt &Zeroable, SDValue V1, SDValue V2,
17714 const X86Subtarget &Subtarget,
17715 SelectionDAG &DAG) {
17716 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17717 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17718 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17719 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17720
17721 // Whenever we can lower this as a zext, that instruction is strictly faster
17722 // than any alternative. It also allows us to fold memory operands into the
17723 // shuffle in many cases.
17724 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17725 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17726 return ZExt;
17727
17728 // Use dedicated unpack instructions for masks that match their pattern.
17729 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17730 return V;
17731
17732 // Use dedicated pack instructions for masks that match their pattern.
17733 if (SDValue V =
17734 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17735 return V;
17736
17737 // Try to use shift instructions.
17738 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17739 Zeroable, Subtarget, DAG))
17740 return Shift;
17741
17742 // Try to use byte rotation instructions.
17743 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17744 Subtarget, DAG))
17745 return Rotate;
17746
17747 if (V2.isUndef()) {
17748 // Try to use bit rotation instructions.
17749 if (SDValue Rotate =
17750 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17751 return Rotate;
17752
17753 SmallVector<int, 8> RepeatedMask;
17754 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17755 // As this is a single-input shuffle, the repeated mask should be
17756 // a strictly valid v8i16 mask that we can pass through to the v8i16
17757 // lowering to handle even the v32 case.
17758 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17759 RepeatedMask, Subtarget, DAG);
17760 }
17761 }
17762
17763 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17764 Zeroable, Subtarget, DAG))
17765 return Blend;
17766
17767 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17768 Zeroable, Subtarget, DAG))
17769 return PSHUFB;
17770
17771 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17772}
17773
17774/// Handle lowering of 64-lane 8-bit integer shuffles.
17775static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17776 const APInt &Zeroable, SDValue V1, SDValue V2,
17777 const X86Subtarget &Subtarget,
17778 SelectionDAG &DAG) {
17779 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17780 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17781 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17782 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17783
17784 // Whenever we can lower this as a zext, that instruction is strictly faster
17785 // than any alternative. It also allows us to fold memory operands into the
17786 // shuffle in many cases.
17787 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17788 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17789 return ZExt;
17790
17791 // Use dedicated unpack instructions for masks that match their pattern.
17792 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17793 return V;
17794
17795 // Use dedicated pack instructions for masks that match their pattern.
17796 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17797 Subtarget))
17798 return V;
17799
17800 // Try to use shift instructions.
17801 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
17802 Zeroable, Subtarget, DAG))
17803 return Shift;
17804
17805 // Try to use byte rotation instructions.
17806 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17807 Subtarget, DAG))
17808 return Rotate;
17809
17810 // Try to use bit rotation instructions.
17811 if (V2.isUndef())
17812 if (SDValue Rotate =
17813 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17814 return Rotate;
17815
17816 // Lower as AND if possible.
17817 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17818 Zeroable, Subtarget, DAG))
17819 return Masked;
17820
17821 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17822 Zeroable, Subtarget, DAG))
17823 return PSHUFB;
17824
17825 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17826 if (Subtarget.hasVBMI())
17827 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17828
17829 // Try to create an in-lane repeating shuffle mask and then shuffle the
17830 // results into the target lanes.
17831 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17832 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17833 return V;
17834
17835 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17836 Zeroable, Subtarget, DAG))
17837 return Blend;
17838
17839 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17840 // shuffle.
17841 if (!V2.isUndef())
17842 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17843 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17844 return Result;
17845
17846 // FIXME: Implement direct support for this type!
17847 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
17848}
17849
17850/// High-level routine to lower various 512-bit x86 vector shuffles.
17851///
17852/// This routine either breaks down the specific type of a 512-bit x86 vector
17853/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17854/// together based on the available instructions.
17855static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17856 MVT VT, SDValue V1, SDValue V2,
17857 const APInt &Zeroable,
17858 const X86Subtarget &Subtarget,
17859 SelectionDAG &DAG) {
17860 assert(Subtarget.hasAVX512() &&((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17861, __PRETTY_FUNCTION__))
17861 "Cannot lower 512-bit vectors w/ basic ISA!")((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/ basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17861, __PRETTY_FUNCTION__))
;
17862
17863 // If we have a single input to the zero element, insert that into V1 if we
17864 // can do so cheaply.
17865 int NumElts = Mask.size();
17866 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17867
17868 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17869 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17870 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17871 return Insertion;
17872
17873 // Handle special cases where the lower or upper half is UNDEF.
17874 if (SDValue V =
17875 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17876 return V;
17877
17878 // Check for being able to broadcast a single element.
17879 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17880 Subtarget, DAG))
17881 return Broadcast;
17882
17883 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17884 // Try using bit ops for masking and blending before falling back to
17885 // splitting.
17886 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17887 Subtarget, DAG))
17888 return V;
17889 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17890 return V;
17891
17892 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17893 }
17894
17895 // Dispatch to each element type for lowering. If we don't have support for
17896 // specific element type shuffles at 512 bits, immediately split them and
17897 // lower them. Each lowering routine of a given type is allowed to assume that
17898 // the requisite ISA extensions for that element type are available.
17899 switch (VT.SimpleTy) {
17900 case MVT::v8f64:
17901 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17902 case MVT::v16f32:
17903 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17904 case MVT::v8i64:
17905 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17906 case MVT::v16i32:
17907 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17908 case MVT::v32i16:
17909 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17910 case MVT::v64i8:
17911 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17912
17913 default:
17914 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17914)
;
17915 }
17916}
17917
17918static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17919 MVT VT, SDValue V1, SDValue V2,
17920 const X86Subtarget &Subtarget,
17921 SelectionDAG &DAG) {
17922 // Shuffle should be unary.
17923 if (!V2.isUndef())
17924 return SDValue();
17925
17926 int ShiftAmt = -1;
17927 int NumElts = Mask.size();
17928 for (int i = 0; i != NumElts; ++i) {
17929 int M = Mask[i];
17930 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17931 "Unexpected mask index.");
17932 if (M < 0)
17933 continue;
17934
17935 // The first non-undef element determines our shift amount.
17936 if (ShiftAmt < 0) {
17937 ShiftAmt = M - i;
17938 // Need to be shifting right.
17939 if (ShiftAmt <= 0)
17940 return SDValue();
17941 }
17942 // All non-undef elements must shift by the same amount.
17943 if (ShiftAmt != M - i)
17944 return SDValue();
17945 }
17946 assert(ShiftAmt >= 0 && "All undef?")((ShiftAmt >= 0 && "All undef?") ? static_cast<
void> (0) : __assert_fail ("ShiftAmt >= 0 && \"All undef?\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 17946, __PRETTY_FUNCTION__))
;
17947
17948 // Great, we found a shift right.
17949 MVT WideVT = VT;
17950 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
17951 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17952 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
17953 DAG.getUNDEF(WideVT), V1,
17954 DAG.getIntPtrConstant(0, DL));
17955 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
17956 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17957 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17958 DAG.getIntPtrConstant(0, DL));
17959}
17960
17961// Determine if this shuffle can be implemented with a KSHIFT instruction.
17962// Returns the shift amount if possible or -1 if not. This is a simplified
17963// version of matchShuffleAsShift.
17964static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17965 int MaskOffset, const APInt &Zeroable) {
17966 int Size = Mask.size();
17967
17968 auto CheckZeros = [&](int Shift, bool Left) {
17969 for (int j = 0; j < Shift; ++j)
17970 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17971 return false;
17972
17973 return true;
17974 };
17975
17976 auto MatchShift = [&](int Shift, bool Left) {
17977 unsigned Pos = Left ? Shift : 0;
17978 unsigned Low = Left ? 0 : Shift;
17979 unsigned Len = Size - Shift;
17980 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17981 };
17982
17983 for (int Shift = 1; Shift != Size; ++Shift)
17984 for (bool Left : {true, false})
17985 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17986 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17987 return Shift;
17988 }
17989
17990 return -1;
17991}
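// Worked example (illustrative annotation, not part of the analyzed source):
// for a v8i1 mask {2, 3, 4, 5, 6, 7, -1, -1} with Zeroable bits 6 and 7 set
// and MaskOffset = 0, Shift = 1 fails in both directions, but Shift = 2 with
// Left = false passes CheckZeros and MatchShift, so the routine returns 2 and
// sets the opcode to X86ISD::KSHIFTR.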
17992
17993
17994// Lower vXi1 vector shuffles.
17995 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17996 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17997 // vector, shuffle it, and then truncate it back.
17998static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17999 MVT VT, SDValue V1, SDValue V2,
18000 const APInt &Zeroable,
18001 const X86Subtarget &Subtarget,
18002 SelectionDAG &DAG) {
18003 assert(Subtarget.hasAVX512() &&((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18004, __PRETTY_FUNCTION__))
18004 "Cannot lower 512-bit vectors w/o basic ISA!")((Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"
) ? static_cast<void> (0) : __assert_fail ("Subtarget.hasAVX512() && \"Cannot lower 512-bit vectors w/o basic ISA!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18004, __PRETTY_FUNCTION__))
;
18005
18006 int NumElts = Mask.size();
18007
18008 // Try to recognize shuffles that are just padding a subvector with zeros.
18009 int SubvecElts = 0;
18010 int Src = -1;
18011 for (int i = 0; i != NumElts; ++i) {
18012 if (Mask[i] >= 0) {
18013 // Grab the source from the first valid mask. All subsequent elements need
18014 // to use this same source.
18015 if (Src < 0)
18016 Src = Mask[i] / NumElts;
18017 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18018 break;
18019 }
18020
18021 ++SubvecElts;
18022 }
18023 assert(SubvecElts != NumElts && "Identity shuffle?")((SubvecElts != NumElts && "Identity shuffle?") ? static_cast
<void> (0) : __assert_fail ("SubvecElts != NumElts && \"Identity shuffle?\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18023, __PRETTY_FUNCTION__))
;
18024
18025 // Clip to a power of 2.
18026 SubvecElts = PowerOf2Floor(SubvecElts);
18027
18028 // Make sure the number of zeroable bits in the top at least covers the bits
18029 // not covered by the subvector.
18030 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18031 assert(Src >= 0 && "Expected a source!")((Src >= 0 && "Expected a source!") ? static_cast<
void> (0) : __assert_fail ("Src >= 0 && \"Expected a source!\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18031, __PRETTY_FUNCTION__))
;
18032 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18033 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18034 Src == 0 ? V1 : V2,
18035 DAG.getIntPtrConstant(0, DL));
18036 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18037 DAG.getConstant(0, DL, VT),
18038 Extract, DAG.getIntPtrConstant(0, DL));
18039 }
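// Worked example (illustrative annotation, not part of the analyzed source):
// for a v8i1 mask {0, 1, 2, 3, 8, 8, 8, 8} with V2 an all-zeros vector, the
// loop above stops at SubvecElts = 4 with Src = 0; Zeroable has its top four
// bits set, so the low v4i1 of V1 is extracted and inserted into a zero
// vector.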
18040
18041 // Try a simple shift right with undef elements. Later we'll try with zeros.
18042 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18043 DAG))
18044 return Shift;
18045
18046 // Try to match KSHIFTs.
18047 unsigned Offset = 0;
18048 for (SDValue V : { V1, V2 }) {
18049 unsigned Opcode;
18050 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18051 if (ShiftAmt >= 0) {
18052 MVT WideVT = VT;
18053 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18054 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18055 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18056 DAG.getUNDEF(WideVT), V,
18057 DAG.getIntPtrConstant(0, DL));
18058 // Widened right shifts need two shifts to ensure we shift in zeroes.
18059 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18060 int WideElts = WideVT.getVectorNumElements();
18061 // Shift left to put the original vector in the MSBs of the new size.
18062 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18063 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18064 // Increase the shift amount to account for the left shift.
18065 ShiftAmt += WideElts - NumElts;
18066 }
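// Worked example (illustrative annotation, not part of the analyzed source):
// lowering a v4i1 KSHIFTR by 1 on a DQI target widens to v8i1: the vector is
// first shifted left by WideElts - NumElts = 4, ShiftAmt becomes 5, and the
// following KSHIFTR shifts zeroes (rather than stale upper bits) into
// element 3.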
18067
18068 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18069 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18070 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18071 DAG.getIntPtrConstant(0, DL));
18072 }
18073 Offset += NumElts; // Increment for next iteration.
18074 }
18075
18076
18077
18078 MVT ExtVT;
18079 switch (VT.SimpleTy) {
18080 default:
18081 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18081)
;
18082 case MVT::v2i1:
18083 ExtVT = MVT::v2i64;
18084 break;
18085 case MVT::v4i1:
18086 ExtVT = MVT::v4i32;
18087 break;
18088 case MVT::v8i1:
18089 // Take a 512-bit type; there are more shuffles available on KNL. If we have
18090 // VLX, use a 256-bit shuffle.
18091 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18092 break;
18093 case MVT::v16i1:
18094 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18095 // 256-bit operation available.
18096 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18097 break;
18098 case MVT::v32i1:
18099 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18100 // 256-bit operation available.
18101 assert(Subtarget.hasBWI() && "Expected AVX512BW support")((Subtarget.hasBWI() && "Expected AVX512BW support") ?
static_cast<void> (0) : __assert_fail ("Subtarget.hasBWI() && \"Expected AVX512BW support\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18101, __PRETTY_FUNCTION__))
;
18102 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18103 break;
18104 case MVT::v64i1:
18105 // Fall back to scalarization. FIXME: We can do better if the shuffle
18106 // can be partitioned cleanly.
18107 if (!Subtarget.useBWIRegs())
18108 return SDValue();
18109 ExtVT = MVT::v64i8;
18110 break;
18111 }
18112
18113 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18114 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18115
18116 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18117 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
18118 int NumElems = VT.getVectorNumElements();
18119 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18120 (Subtarget.hasDQI() && (NumElems < 32)))
18121 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18122 Shuffle, ISD::SETGT);
18123
18124 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18125}
18126
18127/// Helper function that returns true if the shuffle mask should be
18128/// commuted to improve canonicalization.
18129static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18130 int NumElements = Mask.size();
18131
18132 int NumV1Elements = 0, NumV2Elements = 0;
18133 for (int M : Mask)
18134 if (M < 0)
18135 continue;
18136 else if (M < NumElements)
18137 ++NumV1Elements;
18138 else
18139 ++NumV2Elements;
18140
18141 // Commute the shuffle as needed such that more elements come from V1 than
18142 // V2. This allows us to match the shuffle pattern strictly on how many
18143 // elements come from V1 without handling the symmetric cases.
18144 if (NumV2Elements > NumV1Elements)
18145 return true;
18146
18147 assert(NumV1Elements > 0 && "No V1 indices")((NumV1Elements > 0 && "No V1 indices") ? static_cast
<void> (0) : __assert_fail ("NumV1Elements > 0 && \"No V1 indices\""
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18147, __PRETTY_FUNCTION__))
;
18148
18149 if (NumV2Elements == 0)
18150 return false;
18151
18152 // When the number of V1 and V2 elements is the same, try to minimize the
18153 // number of uses of V2 in the low half of the vector. When that is tied,
18154 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18155 // indices for V2. When those are equal, try to ensure that the number of odd
18156 // indices for V1 is lower than the number of odd indices for V2.
18157 if (NumV1Elements == NumV2Elements) {
18158 int LowV1Elements = 0, LowV2Elements = 0;
18159 for (int M : Mask.slice(0, NumElements / 2))
18160 if (M >= NumElements)
18161 ++LowV2Elements;
18162 else if (M >= 0)
18163 ++LowV1Elements;
18164 if (LowV2Elements > LowV1Elements)
18165 return true;
18166 if (LowV2Elements == LowV1Elements) {
18167 int SumV1Indices = 0, SumV2Indices = 0;
18168 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18169 if (Mask[i] >= NumElements)
18170 SumV2Indices += i;
18171 else if (Mask[i] >= 0)
18172 SumV1Indices += i;
18173 if (SumV2Indices < SumV1Indices)
18174 return true;
18175 if (SumV2Indices == SumV1Indices) {
18176 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18177 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18178 if (Mask[i] >= NumElements)
18179 NumV2OddIndices += i % 2;
18180 else if (Mask[i] >= 0)
18181 NumV1OddIndices += i % 2;
18182 if (NumV2OddIndices < NumV1OddIndices)
18183 return true;
18184 }
18185 }
18186 }
18187
18188 return false;
18189}
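// Worked example (illustrative annotation, not part of the analyzed source):
// for a 4-element mask {4, 5, 2, 3} both inputs contribute two elements, but
// the low half uses only V2, so the routine returns true and the caller
// commutes the shuffle to the equivalent mask {0, 1, 6, 7}.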
18190
18191/// Top-level lowering for x86 vector shuffles.
18192///
18193/// This handles decomposition, canonicalization, and lowering of all x86
18194/// vector shuffles. Most of the specific lowering strategies are encapsulated
18195/// above in helper routines. The canonicalization attempts to widen shuffles
18196/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18197/// s.t. only one of the two inputs needs to be tested, etc.
18198static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18199 SelectionDAG &DAG) {
18200 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18201 ArrayRef<int> OrigMask = SVOp->getMask();
18202 SDValue V1 = Op.getOperand(0);
18203 SDValue V2 = Op.getOperand(1);
18204 MVT VT = Op.getSimpleValueType();
18205 int NumElements = VT.getVectorNumElements();
18206 SDLoc DL(Op);
18207 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18208
18209 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18210 "Can't lower MMX shuffles");
18211
18212 bool V1IsUndef = V1.isUndef();
18213 bool V2IsUndef = V2.isUndef();
18214 if (V1IsUndef && V2IsUndef)
18215 return DAG.getUNDEF(VT);
18216
18217 // When we create a shuffle node we put the UNDEF node as the second operand,
18218 // but in some cases the first operand may be transformed to UNDEF.
18219 // In that case we should just commute the node.
18220 if (V1IsUndef)
18221 return DAG.getCommutedVectorShuffle(*SVOp);
18222
18223 // Check for non-undef masks pointing at an undef vector and make the masks
18224 // undef as well. This makes it easier to match the shuffle based solely on
18225 // the mask.
18226 if (V2IsUndef &&
18227 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18228 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18229 for (int &M : NewMask)
18230 if (M >= NumElements)
18231 M = -1;
18232 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18233 }
18234
18235 // Check for illegal shuffle mask element index values.
18236 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18237 (void)MaskUpperLimit;
18238 assert(llvm::all_of(OrigMask,
18239 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18240 "Out of bounds shuffle index");
18241
18242 // We actually see shuffles that are entirely re-arrangements of a set of
18243 // zero inputs. This mostly happens while decomposing complex shuffles into
18244 // simple ones. Directly lower these as a buildvector of zeros.
18245 APInt KnownUndef, KnownZero;
18246 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18247
18248 APInt Zeroable = KnownUndef | KnownZero;
18249 if (Zeroable.isAllOnesValue())
18250 return getZeroVector(VT, Subtarget, DAG, DL);
18251
18252 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18253
18254 // Try to collapse shuffles into using a vector type with fewer elements but
18255 // wider element types. We cap this to not form integers or floating point
18256 // elements wider than 64 bits, but it might be interesting to form i128
18257 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18258 SmallVector<int, 16> WidenedMask;
18259 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18260 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18261 // Shuffle mask widening should not interfere with a broadcast opportunity
18262 // by obfuscating the operands with bitcasts.
18263 // TODO: Avoid lowering directly from this top-level function: make this
18264 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18265 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18266 Subtarget, DAG))
18267 return Broadcast;
18268
18269 MVT NewEltVT = VT.isFloatingPoint()
18270 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18271 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18272 int NewNumElts = NumElements / 2;
18273 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18274 // Make sure that the new vector type is legal. For example, v2f64 isn't
18275 // legal on SSE1.
18276 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18277 if (V2IsZero) {
18278 // Modify the new Mask to take all zeros from the all-zero vector.
18279 // Choose indices that are blend-friendly.
18280 bool UsedZeroVector = false;
18281 assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
18282 "V2's non-undef elements are used?!");
18283 for (int i = 0; i != NewNumElts; ++i)
18284 if (WidenedMask[i] == SM_SentinelZero) {
18285 WidenedMask[i] = i + NewNumElts;
18286 UsedZeroVector = true;
18287 }
18288 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18289 // some elements to be undef.
18290 if (UsedZeroVector)
18291 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18292 }
18293 V1 = DAG.getBitcast(NewVT, V1);
18294 V2 = DAG.getBitcast(NewVT, V2);
18295 return DAG.getBitcast(
18296 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18297 }
18298 }
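// Worked example (illustrative annotation, not part of the analyzed source):
// a v8i32 mask {0, 1, 4, 5, 10, 11, 14, 15} widens to the v4i64 mask
// {0, 2, 5, 7}, so (assuming v4i64 is legal for the subtarget) both operands
// are bitcast to v4i64 above and the shuffle is re-issued on the wider
// element type.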
18299
18300 // Commute the shuffle if it will improve canonicalization.
18301 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18302 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18303 ShuffleVectorSDNode::commuteMask(Mask);
18304 std::swap(V1, V2);
18305 }
18306
18307 // For each vector width, delegate to a specialized lowering routine.
18308 if (VT.is128BitVector())
18309 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18310
18311 if (VT.is256BitVector())
18312 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18313
18314 if (VT.is512BitVector())
18315 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18316
18317 if (Is1BitVector)
18318 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18319
18320 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 18320)
;
18321}
18322
18323/// Try to lower a VSELECT instruction to a vector shuffle.
18324static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18325 const X86Subtarget &Subtarget,
18326 SelectionDAG &DAG) {
18327 SDValue Cond = Op.getOperand(0);
18328 SDValue LHS = Op.getOperand(1);
18329 SDValue RHS = Op.getOperand(2);
18330 MVT VT = Op.getSimpleValueType();
18331
18332 // Only non-legal VSELECTs reach this lowering; convert those into generic
18333 // shuffles and re-use the shuffle lowering path for blends.
18334 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18335 SmallVector<int, 32> Mask;
18336 if (createShuffleMaskFromVSELECT(Mask, Cond))
18337 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18338 }
18339
18340 return SDValue();
18341}
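
createShuffleMaskFromVSELECT (not shown here) conceptually maps a constant blend condition to a shuffle mask: a true lane keeps element i of LHS, while a false lane takes element i of RHS, which lives at index i + NumElts in the concatenated shuffle inputs. A hypothetical standalone sketch of that mapping:

    #include <vector>

    // Build a shuffle mask from a constant vselect condition.
    std::vector<int> maskFromConstantCondition(const std::vector<bool> &Cond) {
      const int NumElts = static_cast<int>(Cond.size());
      std::vector<int> Mask(NumElts);
      for (int i = 0; i != NumElts; ++i)
        Mask[i] = Cond[i] ? i : i + NumElts;
      return Mask;
    }

    // Example: Cond = {1,0,0,1} yields Mask = {0,5,6,3}.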
18342
18343SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18344 SDValue Cond = Op.getOperand(0);
18345 SDValue LHS = Op.getOperand(1);
18346 SDValue RHS = Op.getOperand(2);
18347
18348 // A vselect where all conditions and data are constants can be optimized into
18349 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18350 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18351 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18352 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18353 return SDValue();
18354
18355 // Try to lower this to a blend-style vector shuffle. This can handle all
18356 // constant condition cases.
18357 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18358 return BlendOp;
18359
18360 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18361 // with patterns on the mask registers on AVX-512.
18362 MVT CondVT = Cond.getSimpleValueType();
18363 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18364 if (CondEltSize == 1)
18365 return Op;
18366
18367 // Variable blends are only legal from SSE4.1 onward.
18368 if (!Subtarget.hasSSE41())
18369 return SDValue();
18370
18371 SDLoc dl(Op);
18372 MVT VT = Op.getSimpleValueType();
18373 unsigned EltSize = VT.getScalarSizeInBits();
18374 unsigned NumElts = VT.getVectorNumElements();
18375
18376 // Expand v32i16/v64i8 without BWI.
18377 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18378 return SDValue();
18379
18380 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18381 // into an i1 condition so that we can use the mask-based 512-bit blend
18382 // instructions.
18383 if (VT.getSizeInBits() == 512) {
18384 // Build a mask by testing the condition against zero.
18385 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18386 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18387 DAG.getConstant(0, dl, CondVT),
18388 ISD::SETNE);
18389 // Now return a new VSELECT using the mask.
18390 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18391 }
18392
18393 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18394 if (CondEltSize != EltSize) {
18395 // If we don't have a sign splat, rely on the expansion.
18396 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18397 return SDValue();
18398
18399 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18400 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18401 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18402 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18403 }
18404
18405 // Only some types will be legal on some subtargets. If we can emit a legal
18406 // VSELECT-matching blend, return Op; but if we need to expand, return
18407 // a null value.
18408 switch (VT.SimpleTy) {
18409 default:
18410 // Most of the vector types have blends past SSE4.1.
18411 return Op;
18412
18413 case MVT::v32i8:
18414 // The byte blends for AVX vectors were introduced only in AVX2.
18415 if (Subtarget.hasAVX2())
18416 return Op;
18417
18418 return SDValue();
18419
18420 case MVT::v8i16:
18421 case MVT::v16i16: {
18422 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18423 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18424 Cond = DAG.getBitcast(CastVT, Cond);
18425 LHS = DAG.getBitcast(CastVT, LHS);
18426 RHS = DAG.getBitcast(CastVT, RHS);
18427 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18428 return DAG.getBitcast(VT, Select);
18429 }
18430 }
18431}
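
The v8i16/v16i16 case above works because a legal vXi16 blend condition consists of all-ones or all-zeros lanes, so bitcasting it to bytes yields two identical 0xFF/0x00 bytes per element, and a byte-wide variable blend then selects both halves consistently. A small illustrative sketch (standalone, not the DAG code) of that condition widening:

    #include <cstdint>
    #include <vector>

    // Expand each 16-bit condition lane (0x0000 or 0xFFFF) into two byte lanes.
    std::vector<uint8_t> widenCondToBytes(const std::vector<uint16_t> &Cond16) {
      std::vector<uint8_t> Cond8;
      Cond8.reserve(Cond16.size() * 2);
      for (uint16_t C : Cond16) {
        Cond8.push_back(static_cast<uint8_t>(C & 0xFF));        // low byte
        Cond8.push_back(static_cast<uint8_t>((C >> 8) & 0xFF)); // high byte
      }
      return Cond8;
    }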
18432
18433static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18434 MVT VT = Op.getSimpleValueType();
18435 SDValue Vec = Op.getOperand(0);
18436 SDValue Idx = Op.getOperand(1);
18437 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18438 SDLoc dl(Op);
18439
18440 if (!Vec.getSimpleValueType().is128BitVector())
18441 return SDValue();
18442
18443 if (VT.getSizeInBits() == 8) {
18444 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18445 // we're going to zero extend the register or fold the store.
18446 if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18447 !MayFoldIntoStore(Op))
18448 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18449 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18450 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18451
18452 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18453 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18454 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18455 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18456 }
18457
18458 if (VT == MVT::f32) {
18459 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18460 // the result back to FR32 register. It's only worth matching if the
18461 // result has a single use which is a store or a bitcast to i32. And in
18462 // the case of a store, it's not worth it if the index is a constant 0,
18463 // because a MOVSSmr can be used instead, which is smaller and faster.
18464 if (!Op.hasOneUse())
18465 return SDValue();
18466 SDNode *User = *Op.getNode()->use_begin();
18467 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18468 (User->getOpcode() != ISD::BITCAST ||
18469 User->getValueType(0) != MVT::i32))
18470 return SDValue();
18471 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18472 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18473 return DAG.getBitcast(MVT::f32, Extract);
18474 }
18475
18476 if (VT == MVT::i32 || VT == MVT::i64)
18477 return Op;
18478
18479 return SDValue();
18480}
18481
18482/// Extract one bit from mask vector, like v16i1 or v8i1.
18483/// AVX-512 feature.
18484static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18485 const X86Subtarget &Subtarget) {
18486 SDValue Vec = Op.getOperand(0);
18487 SDLoc dl(Vec);
18488 MVT VecVT = Vec.getSimpleValueType();
18489 SDValue Idx = Op.getOperand(1);
18490 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18491 MVT EltVT = Op.getSimpleValueType();
18492
18493 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18494 "Unexpected vector type in ExtractBitFromMaskVector");
18495
18496 // A variable index can't be handled in mask registers,
18497 // so extend the vector to VR512/128.
18498 if (!IdxC) {
18499 unsigned NumElts = VecVT.getVectorNumElements();
18500 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18501 // than extending to 128/256-bit.
18502 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18503 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18504 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18505 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18506 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18507 }
18508
18509 unsigned IdxVal = IdxC->getZExtValue();
18510 if (IdxVal == 0) // the operation is legal
18511 return Op;
18512
18513 // Extend to natively supported kshift.
18514 unsigned NumElems = VecVT.getVectorNumElements();
18515 MVT WideVecVT = VecVT;
18516 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18517 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18518 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18519 DAG.getUNDEF(WideVecVT), Vec,
18520 DAG.getIntPtrConstant(0, dl));
18521 }
18522
18523 // Use kshiftr instruction to move to the lower element.
18524 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18525 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18526
18527 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18528 DAG.getIntPtrConstant(0, dl));
18529}
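
The constant-index path above has a simple scalar analogue: treat the k-register as a bitmask, shift the wanted bit down to position 0 (KSHIFTR), and read bit 0, where the extract is legal. A sketch, assuming a mask that fits in 16 bits such as v16i1:

    #include <cstdint>

    // Model of KSHIFTR + extract-lane-0 on a 16-bit mask.
    bool extractMaskBit(uint16_t Mask, unsigned IdxVal) {
      return ((Mask >> IdxVal) & 1u) != 0;
    }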
18530
18531SDValue
18532X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18533 SelectionDAG &DAG) const {
18534 SDLoc dl(Op);
18535 SDValue Vec = Op.getOperand(0);
18536 MVT VecVT = Vec.getSimpleValueType();
18537 SDValue Idx = Op.getOperand(1);
18538 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18539
18540 if (VecVT.getVectorElementType() == MVT::i1)
18541 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18542
18543 if (!IdxC) {
18544 // It's more profitable to go through memory (1 cycle throughput)
18545 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18546 // The IACA tool was used to get the performance estimate
18547 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer).
18548 //
18549 // example : extractelement <16 x i8> %a, i32 %i
18550 //
18551 // Block Throughput: 3.00 Cycles
18552 // Throughput Bottleneck: Port5
18553 //
18554 // | Num Of | Ports pressure in cycles | |
18555 // | Uops | 0 - DV | 5 | 6 | 7 | |
18556 // ---------------------------------------------
18557 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18558 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18559 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18560 // Total Num Of Uops: 4
18561 //
18562 //
18563 // Block Throughput: 1.00 Cycles
18564 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18565 //
18566 // | | Ports pressure in cycles | |
18567 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18568 // ---------------------------------------------------------
18569 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18570 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18571 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18572 // Total Num Of Uops: 4
18573
18574 return SDValue();
18575 }
18576
18577 unsigned IdxVal = IdxC->getZExtValue();
18578
18579 // If this is a 256-bit vector result, first extract the 128-bit vector and
18580 // then extract the element from the 128-bit vector.
18581 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18582 // Get the 128-bit vector.
18583 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18584 MVT EltVT = VecVT.getVectorElementType();
18585
18586 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18587 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18588
18589 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18590 // this can be done with a mask.
18591 IdxVal &= ElemsPerChunk - 1;
18592 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18593 DAG.getIntPtrConstant(IdxVal, dl));
18594 }
18595
18596 assert(VecVT.is128BitVector() && "Unexpected vector length");
18597
18598 MVT VT = Op.getSimpleValueType();
18599
18600 if (VT.getSizeInBits() == 16) {
18601 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18602 // we're going to zero extend the register or fold the store (SSE41 only).
18603 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18604 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18605 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18606 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18607 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18608
18609 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18610 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18611 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18612 }
18613
18614 if (Subtarget.hasSSE41())
18615 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18616 return Res;
18617
18618 // TODO: We only extract a single element from v16i8, so we can probably afford
18619 // to be more aggressive here before falling back to the default approach of
18620 // spilling to the stack.
18621 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18622 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18623 int DWordIdx = IdxVal / 4;
18624 if (DWordIdx == 0) {
18625 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18626 DAG.getBitcast(MVT::v4i32, Vec),
18627 DAG.getIntPtrConstant(DWordIdx, dl));
18628 int ShiftVal = (IdxVal % 4) * 8;
18629 if (ShiftVal != 0)
18630 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18631 DAG.getConstant(ShiftVal, dl, MVT::i8));
18632 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18633 }
18634
18635 int WordIdx = IdxVal / 2;
18636 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18637 DAG.getBitcast(MVT::v8i16, Vec),
18638 DAG.getIntPtrConstant(WordIdx, dl));
18639 int ShiftVal = (IdxVal % 2) * 8;
18640 if (ShiftVal != 0)
18641 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18642 DAG.getConstant(ShiftVal, dl, MVT::i8));
18643 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18644 }
18645
18646 if (VT.getSizeInBits() == 32) {
18647 if (IdxVal == 0)
18648 return Op;
18649
18650 // SHUFPS the element to the lowest double word, then movss.
18651 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18652 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18653 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18654 DAG.getIntPtrConstant(0, dl));
18655 }
18656
18657 if (VT.getSizeInBits() == 64) {
18658 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18659 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18660 // to match extract_elt for f64.
18661 if (IdxVal == 0)
18662 return Op;
18663
18664 // UNPCKHPD the element to the lowest double word, then movsd.
18665 // Note that if the lower 64 bits of the result of the UNPCKHPD are then stored
18666 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
18667 int Mask[2] = { 1, -1 };
18668 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18669 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18670 DAG.getIntPtrConstant(0, dl));
18671 }
18672
18673 return SDValue();
18674}
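
The v16i8 path above avoids a stack spill by extracting the containing dword or word and shifting the wanted byte into the low bits before truncating. A standalone model of the idea, assuming x86's little-endian lane layout (this generalizes the DWordIdx == 0 case; the word path is the same with 16-bit lanes):

    #include <cstdint>

    // Extract byte IdxVal of a 16-byte vector viewed as four 32-bit lanes.
    uint8_t extractByteViaDWord(const uint32_t DWords[4], unsigned IdxVal) {
      uint32_t DWord = DWords[IdxVal / 4];            // EXTRACT_VECTOR_ELT (v4i32)
      unsigned ShiftVal = (IdxVal % 4) * 8;           // byte position in the lane
      return static_cast<uint8_t>(DWord >> ShiftVal); // SRL + TRUNCATE
    }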
18675
18676/// Insert one bit to mask vector, like v16i1 or v8i1.
18677/// AVX-512 feature.
18678static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18679 const X86Subtarget &Subtarget) {
18680 SDLoc dl(Op);
18681 SDValue Vec = Op.getOperand(0);
18682 SDValue Elt = Op.getOperand(1);
18683 SDValue Idx = Op.getOperand(2);
18684 MVT VecVT = Vec.getSimpleValueType();
18685
18686 if (!isa<ConstantSDNode>(Idx)) {
18687 // Non-constant index. Extend the source and destination,
18688 // insert the element, and then truncate the result.
18689 unsigned NumElts = VecVT.getVectorNumElements();
18690 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18691 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18692 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18693 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18694 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18695 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18696 }
18697
18698 // Copy into a k-register, extract to v1i1 and insert_subvector.
18699 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18700 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18701}
18702
18703SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18704 SelectionDAG &DAG) const {
18705 MVT VT = Op.getSimpleValueType();
18706 MVT EltVT = VT.getVectorElementType();
18707 unsigned NumElts = VT.getVectorNumElements();
18708
18709 if (EltVT == MVT::i1)
18710 return InsertBitToMaskVector(Op, DAG, Subtarget);
18711
18712 SDLoc dl(Op);
18713 SDValue N0 = Op.getOperand(0);
18714 SDValue N1 = Op.getOperand(1);
18715 SDValue N2 = Op.getOperand(2);
18716
18717 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18718 if (!N2C || N2C->getAPIntValue().uge(NumElts))
18719 return SDValue();
18720 uint64_t IdxVal = N2C->getZExtValue();
18721
18722 bool IsZeroElt = X86::isZeroNode(N1);
18723 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18724
18725 // If we are inserting an element, see if we can do this more efficiently with
18726 // a blend shuffle against a rematerializable vector than with a costly integer
18727 // insertion.
18728 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
18729 16 <= EltVT.getSizeInBits()) {
18730 SmallVector<int, 8> BlendMask;
18731 for (unsigned i = 0; i != NumElts; ++i)
18732 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18733 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18734 : getOnesVector(VT, DAG, dl);
18735 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18736 }
18737
18738 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18739 // into that, and then insert the subvector back into the result.
18740 if (VT.is256BitVector() || VT.is512BitVector()) {
18741 // With a 256-bit vector, we can insert into the zero element efficiently
18742 // using a blend if we have AVX or AVX2 and the right data type.
18743 if (VT.is256BitVector() && IdxVal == 0) {
18744 // TODO: It is worthwhile to cast integer to floating point and back
18745 // and incur a domain crossing penalty if that's what we'll end up
18746 // doing anyway after extracting to a 128-bit vector.
18747 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18748 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
18749 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18750 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18751 DAG.getTargetConstant(1, dl, MVT::i8));
18752 }
18753 }
18754
18755 // Get the desired 128-bit vector chunk.
18756 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18757
18758 // Insert the element into the desired chunk.
18759 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
18760 assert(isPowerOf2_32(NumEltsIn128));
18761 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18762 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18763
18764 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18765 DAG.getIntPtrConstant(IdxIn128, dl));
18766
18767 // Insert the changed part back into the bigger vector
18768 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18769 }
18770 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18771
18772 // This will be just movd/movq/movss/movsd.
18773 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18774 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18775 EltVT == MVT::i64) {
18776 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18777 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18778 }
18779
18780 // We can't directly insert an i8 or i16 into a vector, so zero extend
18781 // it to i32 first.
18782 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18783 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18784 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
18785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18786 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18787 return DAG.getBitcast(VT, N1);
18788 }
18789 }
18790
18791 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18792 // argument. SSE41 required for pinsrb.
18793 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18794 unsigned Opc;
18795 if (VT == MVT::v8i16) {
18796 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18797 Opc = X86ISD::PINSRW;
18798 } else {
18799 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18800 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18801 Opc = X86ISD::PINSRB;
18802 }
18803
18804 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18805 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18806 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18807 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18808 }
18809
18810 if (Subtarget.hasSSE41()) {
18811 if (EltVT == MVT::f32) {
18812 // Bits [7:6] of the constant are the source select. This will always be
18813 // zero here. The DAG Combiner may combine an extract_elt index into
18814 // these bits. For example (insert (extract, 3), 2) could be matched by
18815 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18816 // Bits [5:4] of the constant are the destination select. This is the
18817 // value of the incoming immediate.
18818 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18819 // combine either bitwise AND or insert of float 0.0 to set these bits.
18820
18821 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18822 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
18823 // If this is an insertion of 32-bits into the low 32-bits of
18824 // a vector, we prefer to generate a blend with immediate rather
18825 // than an insertps. Blends are simpler operations in hardware and so
18826 // will always have equal or better performance than insertps.
18827 // But if optimizing for size and there's a load folding opportunity,
18828 // generate insertps because blendps does not have a 32-bit memory
18829 // operand form.
18830 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18831 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18832 DAG.getTargetConstant(1, dl, MVT::i8));
18833 }
18834 // Create this as a scalar to vector.
18835 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18836 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18837 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18838 }
18839
18840 // PINSR* works with constant index.
18841 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18842 return Op;
18843 }
18844
18845 return SDValue();
18846}
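
The INSERTPS immediate layout described in the comments above (bits [7:6] source select, bits [5:4] destination select, bits [3:0] zero mask) can be captured by a small illustrative encoder; the lowering only ever needs the destination-select field, hence the IdxVal << 4 constant:

    #include <cstdint>

    // Hypothetical helper: pack the INSERTPS control byte.
    uint8_t encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZeroMask) {
      return static_cast<uint8_t>(((SrcIdx & 3) << 6) |
                                  ((DstIdx & 3) << 4) |
                                  (ZeroMask & 0xF));
    }

    // encodeInsertPSImm(0, IdxVal, 0) corresponds to the IdxVal << 4 used above.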
18847
18848static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18849 SelectionDAG &DAG) {
18850 SDLoc dl(Op);
18851 MVT OpVT = Op.getSimpleValueType();
18852
18853 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
18854 // further combines.
18855 if (X86::isZeroNode(Op.getOperand(0)))
18856 return getZeroVector(OpVT, Subtarget, DAG, dl);
18857
18858 // If this is a 256-bit vector result, first insert into a 128-bit
18859 // vector and then insert into the 256-bit vector.
18860 if (!OpVT.is128BitVector()) {
18861 // Insert into a 128-bit vector.
18862 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18863 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18864 OpVT.getVectorNumElements() / SizeFactor);
18865
18866 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18867
18868 // Insert the 128-bit vector.
18869 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18870 }
18871 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18872 "Expected an SSE type!");
18873
18874 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
18875 if (OpVT == MVT::v4i32)
18876 return Op;
18877
18878 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18879 return DAG.getBitcast(
18880 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18881}
18882
18883// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18884// simple superregister reference or explicit instructions to insert
18885// the upper bits of a vector.
18886static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18887 SelectionDAG &DAG) {
18888 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18889
18890 return insert1BitVector(Op, DAG, Subtarget);
18891}
18892
18893static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18894 SelectionDAG &DAG) {
18895 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18896 "Only vXi1 extract_subvectors need custom lowering");
18897
18898 SDLoc dl(Op);
18899 SDValue Vec = Op.getOperand(0);
18900 uint64_t IdxVal = Op.getConstantOperandVal(1);
18901
18902 if (IdxVal == 0) // the operation is legal
18903 return Op;
18904
18905 MVT VecVT = Vec.getSimpleValueType();
18906 unsigned NumElems = VecVT.getVectorNumElements();
18907
18908 // Extend to natively supported kshift.
18909 MVT WideVecVT = VecVT;
18910 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18911 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18912 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18913 DAG.getUNDEF(WideVecVT), Vec,
18914 DAG.getIntPtrConstant(0, dl));
18915 }
18916
18917 // Shift to the LSB.
18918 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18919 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18920
18921 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18922 DAG.getIntPtrConstant(0, dl));
18923}
18924
18925// Returns the appropriate wrapper opcode for a global reference.
18926unsigned X86TargetLowering::getGlobalWrapperKind(
18927 const GlobalValue *GV, const unsigned char OpFlags) const {
18928 // References to absolute symbols are never PC-relative.
18929 if (GV && GV->isAbsoluteSymbolRef())
18930 return X86ISD::Wrapper;
18931
18932 CodeModel::Model M = getTargetMachine().getCodeModel();
18933 if (Subtarget.isPICStyleRIPRel() &&
18934 (M == CodeModel::Small || M == CodeModel::Kernel))
18935 return X86ISD::WrapperRIP;
18936
18937 // GOTPCREL references must always use RIP.
18938 if (OpFlags == X86II::MO_GOTPCREL)
18939 return X86ISD::WrapperRIP;
18940
18941 return X86ISD::Wrapper;
18942}
18943
18944// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18945// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18946// one of the above mentioned nodes. It has to be wrapped because otherwise
18947// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18948 // be used to form an addressing mode. These wrapped nodes will be selected
18949// into MOV32ri.
18950SDValue
18951X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18952 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18953
18954 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18955 // global base reg.
18956 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18957
18958 auto PtrVT = getPointerTy(DAG.getDataLayout());
18959 SDValue Result = DAG.getTargetConstantPool(
18960 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18961 SDLoc DL(CP);
18962 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18963 // With PIC, the address is actually $g + Offset.
18964 if (OpFlag) {
18965 Result =
18966 DAG.getNode(ISD::ADD, DL, PtrVT,
18967 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18968 }
18969
18970 return Result;
18971}
18972
18973SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18974 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18975
18976 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18977 // global base reg.
18978 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18979
18980 auto PtrVT = getPointerTy(DAG.getDataLayout());
18981 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18982 SDLoc DL(JT);
18983 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18984
18985 // With PIC, the address is actually $g + Offset.
18986 if (OpFlag)
18987 Result =
18988 DAG.getNode(ISD::ADD, DL, PtrVT,
18989 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18990
18991 return Result;
18992}
18993
18994SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18995 SelectionDAG &DAG) const {
18996 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18997}
18998
18999SDValue
19000X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19001 // Create the TargetBlockAddressAddress node.
19002 unsigned char OpFlags =
19003 Subtarget.classifyBlockAddressReference();
19004 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19005 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19006 SDLoc dl(Op);
19007 auto PtrVT = getPointerTy(DAG.getDataLayout());
19008 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19009 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19010
19011 // With PIC, the address is actually $g + Offset.
19012 if (isGlobalRelativeToPICBase(OpFlags)) {
19013 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19014 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19015 }
19016
19017 return Result;
19018}
19019
19020/// Creates target global address or external symbol nodes for calls or
19021/// other uses.
19022SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19023 bool ForCall) const {
19024 // Unpack the global address or external symbol.
19025 const SDLoc &dl = SDLoc(Op);
19026 const GlobalValue *GV = nullptr;
19027 int64_t Offset = 0;
19028 const char *ExternalSym = nullptr;
19029 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19030 GV = G->getGlobal();
19031 Offset = G->getOffset();
19032 } else {
19033 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19034 ExternalSym = ES->getSymbol();
19035 }
19036
19037 // Calculate some flags for address lowering.
19038 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19039 unsigned char OpFlags;
19040 if (ForCall)
19041 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19042 else
19043 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19044 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19045 bool NeedsLoad = isGlobalStubReference(OpFlags);
19046
19047 CodeModel::Model M = DAG.getTarget().getCodeModel();
19048 auto PtrVT = getPointerTy(DAG.getDataLayout());
19049 SDValue Result;
19050
19051 if (GV) {
19052 // Create a target global address if this is a global. If possible, fold the
19053 // offset into the global address reference. Otherwise, ADD it on later.
19054 int64_t GlobalOffset = 0;
19055 if (OpFlags == X86II::MO_NO_FLAG &&
19056 X86::isOffsetSuitableForCodeModel(Offset, M)) {
19057 std::swap(GlobalOffset, Offset);
19058 }
19059 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19060 } else {
19061 // If this is not a global address, this must be an external symbol.
19062 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19063 }
19064
19065 // If this is a direct call, avoid the wrapper if we don't need to do any
19066 // loads or adds. This allows SDAG ISel to match direct calls.
19067 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19068 return Result;
19069
19070 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19071
19072 // With PIC, the address is actually $g + Offset.
19073 if (HasPICReg) {
19074 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19075 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19076 }
19077
19078 // For globals that require a load from a stub to get the address, emit the
19079 // load.
19080 if (NeedsLoad)
19081 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19082 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19083
19084 // If there was a non-zero offset that we didn't fold, create an explicit
19085 // addition for it.
19086 if (Offset != 0)
19087 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19088 DAG.getConstant(Offset, dl, PtrVT));
19089
19090 return Result;
19091}
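
The tail of LowerGlobalOrExternal composes the final address in a fixed order: wrap the symbol, add the PIC base if required, load through the GOT stub if required, then add any offset that could not be folded. A rough order-of-operations model (illustrative only, not real addressing code):

    #include <cstdint>

    uintptr_t formGlobalAddress(uintptr_t WrappedSym, uintptr_t PicBase,
                                bool HasPICReg, bool NeedsLoad, int64_t Offset) {
      uintptr_t Addr = WrappedSym;
      if (HasPICReg)
        Addr += PicBase;                                   // the $g + Offset form
      if (NeedsLoad)
        Addr = *reinterpret_cast<const uintptr_t *>(Addr); // load from the stub
      return Addr + static_cast<uintptr_t>(Offset);        // leftover explicit offset
    }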
19092
19093SDValue
19094X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19095 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19096}
19097
19098static SDValue
19099GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19100 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19101 unsigned char OperandFlags, bool LocalDynamic = false) {
19102 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19103 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19104 SDLoc dl(GA);
19105 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19106 GA->getValueType(0),
19107 GA->getOffset(),
19108 OperandFlags);
19109
19110 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19111 : X86ISD::TLSADDR;
19112
19113 if (InFlag) {
19114 SDValue Ops[] = { Chain, TGA, *InFlag };
19115 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19116 } else {
19117 SDValue Ops[] = { Chain, TGA };
19118 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19119 }
19120
19121 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19122 MFI.setAdjustsStack(true);
19123 MFI.setHasCalls(true);
19124
19125 SDValue Flag = Chain.getValue(1);
19126 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19127}
19128
19129// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19130static SDValue
19131LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19132 const EVT PtrVT) {
19133 SDValue InFlag;
19134 SDLoc dl(GA); // ? function entry point might be better
19135 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19136 DAG.getNode(X86ISD::GlobalBaseReg,
19137 SDLoc(), PtrVT), InFlag);
19138 InFlag = Chain.getValue(1);
19139
19140 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19141}
19142
19143// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19144static SDValue
19145LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19146 const EVT PtrVT) {
19147 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19148 X86::RAX, X86II::MO_TLSGD);
19149}
19150
19151// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19152static SDValue
19153LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19154 const EVT PtrVT) {
19155 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19156 X86::EAX, X86II::MO_TLSGD);
19157}
19158
19159static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19160 SelectionDAG &DAG, const EVT PtrVT,
19161 bool Is64Bit, bool Is64BitLP64) {
19162 SDLoc dl(GA);
19163
19164 // Get the start address of the TLS block for this module.
19165 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19166 .getInfo<X86MachineFunctionInfo>();
19167 MFI->incNumLocalDynamicTLSAccesses();
19168
19169 SDValue Base;
19170 if (Is64Bit) {
19171 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19172 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19173 X86II::MO_TLSLD, /*LocalDynamic=*/true);
19174 } else {
19175 SDValue InFlag;
19176 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19177 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19178 InFlag = Chain.getValue(1);
19179 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19180 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19181 }
19182
19183 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19184 // of Base.
19185
19186 // Build x@dtpoff.
19187 unsigned char OperandFlags = X86II::MO_DTPOFF;
19188 unsigned WrapperKind = X86ISD::Wrapper;
19189 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19190 GA->getValueType(0),
19191 GA->getOffset(), OperandFlags);
19192 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19193
19194 // Add x@dtpoff with the base.
19195 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19196}
19197
19198// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19199static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19200 const EVT PtrVT, TLSModel::Model model,
19201 bool is64Bit, bool isPIC) {
19202 SDLoc dl(GA);
19203
19204 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19205 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19206 is64Bit ? 257 : 256));
19207
19208 SDValue ThreadPointer =
19209 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19210 MachinePointerInfo(Ptr));
19211
19212 unsigned char OperandFlags = 0;
19213 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19214 // initialexec.
19215 unsigned WrapperKind = X86ISD::Wrapper;
19216 if (model == TLSModel::LocalExec) {
19217 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19218 } else if (model == TLSModel::InitialExec) {
19219 if (is64Bit) {
19220 OperandFlags = X86II::MO_GOTTPOFF;
19221 WrapperKind = X86ISD::WrapperRIP;
19222 } else {
19223 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19224 }
19225 } else {
19226 llvm_unreachable("Unexpected model");
19227 }
19228
19229 // emit "addl x@ntpoff,%eax" (local exec)
19230 // or "addl x@indntpoff,%eax" (initial exec)
19231 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19232 SDValue TGA =
19233 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19234 GA->getOffset(), OperandFlags);
19235 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19236
19237 if (model == TLSModel::InitialExec) {
19238 if (isPIC && !is64Bit) {
19239 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19240 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19241 Offset);
19242 }
19243
19244 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19245 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19246 }
19247
19248 // The address of the thread local variable is the add of the thread
19249 // pointer with the offset of the variable.
19250 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19251}
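
Both exec models end with the same thread-pointer-plus-offset addition; they differ only in where the offset comes from. A rough scalar model (illustrative only): local-exec adds a link-time constant, while initial-exec first loads the offset from the GOT.

    #include <cstdint>

    uintptr_t tlsLocalExec(uintptr_t ThreadPointer, intptr_t TpOff) {
      return ThreadPointer + static_cast<uintptr_t>(TpOff);    // addl x@ntpoff,%eax
    }

    uintptr_t tlsInitialExec(uintptr_t ThreadPointer, const intptr_t *GotSlot) {
      return ThreadPointer + static_cast<uintptr_t>(*GotSlot); // load x@gottpoff first
    }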
19252
19253SDValue
19254X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19255
19256 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19257
19258 if (DAG.getTarget().useEmulatedTLS())
19259 return LowerToTLSEmulatedModel(GA, DAG);
19260
19261 const GlobalValue *GV = GA->getGlobal();
19262 auto PtrVT = getPointerTy(DAG.getDataLayout());
19263 bool PositionIndependent = isPositionIndependent();
19264
19265 if (Subtarget.isTargetELF()) {
19266 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19267 switch (model) {
19268 case TLSModel::GeneralDynamic:
19269 if (Subtarget.is64Bit()) {
19270 if (Subtarget.isTarget64BitLP64())
19271 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19272 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19273 }
19274 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19275 case TLSModel::LocalDynamic:
19276 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19277 Subtarget.isTarget64BitLP64());
19278 case TLSModel::InitialExec:
19279 case TLSModel::LocalExec:
19280 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19281 PositionIndependent);
19282 }
19283 llvm_unreachable("Unknown TLS model.");
19284 }
19285
19286 if (Subtarget.isTargetDarwin()) {
19287 // Darwin only has one model of TLS. Lower to that.
19288 unsigned char OpFlag = 0;
19289 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19290 X86ISD::WrapperRIP : X86ISD::Wrapper;
19291
19292 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19293 // global base reg.
19294 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19295 if (PIC32)
19296 OpFlag = X86II::MO_TLVP_PIC_BASE;
19297 else
19298 OpFlag = X86II::MO_TLVP;
19299 SDLoc DL(Op);
19300 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19301 GA->getValueType(0),
19302 GA->getOffset(), OpFlag);
19303 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19304
19305 // With PIC32, the address is actually $g + Offset.
19306 if (PIC32)
19307 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19308 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19309 Offset);
19310
19311 // Lowering the machine isd will make sure everything is in the right
19312 // location.
19313 SDValue Chain = DAG.getEntryNode();
19314 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19315 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19316 SDValue Args[] = { Chain, Offset };
19317 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19318 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19319 DAG.getIntPtrConstant(0, DL, true),
19320 Chain.getValue(1), DL);
19321
19322 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19323 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19324 MFI.setAdjustsStack(true);
19325
19326 // And our return value (tls address) is in the standard call return value
19327 // location.
19328 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19329 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19330 }
19331
19332 if (Subtarget.isOSWindows()) {
19333 // Just use the implicit TLS architecture
19334 // Need to generate something similar to:
19335 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19336 // ; from TEB
19337 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19338 // mov rcx, qword [rdx+rcx*8]
19339 // mov eax, .tls$:tlsvar
19340 // [rax+rcx] contains the address
19341 // Windows 64bit: gs:0x58
19342 // Windows 32bit: fs:__tls_array
19343
19344 SDLoc dl(GA);
19345 SDValue Chain = DAG.getEntryNode();
19346
19347 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19348 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19349 // use its literal value of 0x2C.
19350 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19351 ? Type::getInt8PtrTy(*DAG.getContext(),
19352 256)
19353 : Type::getInt32PtrTy(*DAG.getContext(),
19354 257));
19355
19356 SDValue TlsArray = Subtarget.is64Bit()
19357 ? DAG.getIntPtrConstant(0x58, dl)
19358 : (Subtarget.isTargetWindowsGNU()
19359 ? DAG.getIntPtrConstant(0x2C, dl)
19360 : DAG.getExternalSymbol("_tls_array", PtrVT));
19361
19362 SDValue ThreadPointer =
19363 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19364
19365 SDValue res;
19366 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19367 res = ThreadPointer;
19368 } else {
19369 // Load the _tls_index variable
19370 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19371 if (Subtarget.is64Bit())
19372 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19373 MachinePointerInfo(), MVT::i32);
19374 else
19375 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19376
19377 const DataLayout &DL = DAG.getDataLayout();
19378 SDValue Scale =
19379 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19380 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19381
19382 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19383 }
19384
19385 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19386
19387 // Get the offset of start of .tls section
19388 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19389 GA->getValueType(0),
19390 GA->getOffset(), X86II::MO_SECREL);
19391 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19392
19393 // The address of the thread local variable is the add of the thread
19394 // pointer with the offset of the variable.
19395 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19396 }
19397
19398 llvm_unreachable("TLS not implemented for this target.");
19399}
19400
19401/// Lower SRA_PARTS and friends, which return two i32 values
19402/// and take a 2 x i32 value to shift plus a shift amount.
19403/// TODO: Can this be moved to general expansion code?
19404static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19405 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
19406 MVT VT = Op.getSimpleValueType();
19407 unsigned VTBits = VT.getSizeInBits();
19408 SDLoc dl(Op);
19409 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
19410 SDValue ShOpLo = Op.getOperand(0);
19411 SDValue ShOpHi = Op.getOperand(1);
19412 SDValue ShAmt = Op.getOperand(2);
19413 // ISD::FSHL and ISD::FSHR have defined overflow behavior, but ISD::SHL and
19414 // ISD::SRA/L nodes don't. Insert an AND to be safe; it's optimized away
19415 // during isel.
19416 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
19417 DAG.getConstant(VTBits - 1, dl, MVT::i8));
19418 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
19419 DAG.getConstant(VTBits - 1, dl, MVT::i8))
19420 : DAG.getConstant(0, dl, VT);
19421
19422 SDValue Tmp2, Tmp3;
19423 if (Op.getOpcode() == ISD::SHL_PARTS) {
19424 Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
19425 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
19426 } else {
19427 Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
19428 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
19429 }
19430
19431 // If the shift amount is larger than or equal to the width of a part, we can't
19432 // rely on the results of shld/shrd. Insert a test and select the appropriate
19433 // values for large shift amounts.
19434 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
19435 DAG.getConstant(VTBits, dl, MVT::i8));
19436 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
19437 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
19438
19439 SDValue Hi, Lo;
19440 if (Op.getOpcode() == ISD::SHL_PARTS) {
19441 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
19442 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
19443 } else {
19444 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
19445 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
19446 }
19447
19448 return DAG.getMergeValues({ Lo, Hi }, dl);
19449}
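
The SHL_PARTS branch of the selects above can be checked against a plain scalar reference. A minimal sketch for 32-bit parts and a 6-bit shift amount (assumed standalone code, mirroring Tmp1/Tmp2/Tmp3 and the amt & VTBits test):

    #include <cstdint>
    #include <utility>

    // Funnel-shift-left with the same modulo semantics as ISD::FSHL.
    static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
      Amt &= 31;
      return Amt == 0 ? Hi : (Hi << Amt) | (Lo >> (32 - Amt));
    }

    // Returns {Lo, Hi} of (Hi:Lo) << Amt.
    static std::pair<uint32_t, uint32_t> shlParts(uint32_t Lo, uint32_t Hi,
                                                  unsigned Amt) {
      uint32_t Tmp1 = 0;                   // fill value (sign bits for SRA_PARTS)
      uint32_t Tmp2 = fshl32(Hi, Lo, Amt); // candidate high part
      uint32_t Tmp3 = Lo << (Amt & 31);    // candidate low part
      bool Big = (Amt & 32) != 0;          // shift amount >= part width?
      return {Big ? Tmp1 : Tmp3, Big ? Tmp3 : Tmp2};
    }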
19450
19451static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19452 SelectionDAG &DAG) {
19453 MVT VT = Op.getSimpleValueType();
19454 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
19455 "Unexpected funnel shift opcode!");
19456
19457 SDLoc DL(Op);
19458 SDValue Op0 = Op.getOperand(0);
19459 SDValue Op1 = Op.getOperand(1);
19460 SDValue Amt = Op.getOperand(2);
19461
19462 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19463
19464 if (VT.isVector()) {
19465 assert(Subtarget.hasVBMI2() && "Expected VBMI2");
19466
19467 if (IsFSHR)
19468 std::swap(Op0, Op1);
19469
19470 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19471 if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19472 Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19473 Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19474 }
19475
19476 SDValue Funnel;
19477 APInt APIntShiftAmt;
19478 MVT ResultVT = Op0.getSimpleValueType();
19479 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19480 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19481 Funnel =
19482 DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19483 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19484 } else {
19485 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19486 Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19487 Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19488 ResultVT, Op0, Op1, Amt);
19489 }
19490 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19491 Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19492 return Funnel;
19493 }
19494 assert(
19495 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
19496 "Unexpected funnel shift type!");
19497
19498 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19499 bool OptForSize = DAG.shouldOptForSize();
19500 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19501
19502 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19503 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
19504 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19505 !isa<ConstantSDNode>(Amt)) {
19506 unsigned EltSizeInBits = VT.getScalarSizeInBits();
19507 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19508 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19509 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19510 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19511 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19512 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19513 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19514 if (IsFSHR) {
19515 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19516 } else {
19517 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19518 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19519 }
19520 return DAG.getZExtOrTrunc(Res, DL, VT);
19521 }
19522
19523 if (VT == MVT::i8 || ExpandFunnel)
19524 return SDValue();
19525
19526 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
19527 if (VT == MVT::i16) {
19528 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19529 DAG.getConstant(15, DL, Amt.getValueType()));
19530 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19531 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19532 }
19533
19534 return Op;
19535}
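// A minimal standalone sketch (illustration only, not used by the lowering
// above) of the i8 expansion described in the fshl/fshr comment at 19502-19503,
// written on plain 32-bit scalars. The helper name fshl8Ref is hypothetical.
static inline unsigned char fshl8Ref(unsigned char X, unsigned char Y,
                                     unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;     // (aext(x) << bw) | zext(y)
  unsigned Amt = Z & 7;                       // z & (bw - 1)
  return (unsigned char)((Wide << Amt) >> 8); // << amt, >> bw, then truncate
}
// For example, fshl8Ref(0xAB, 0xCD, 4) == 0xBC. fshr is the same concatenation
// shifted right by (z & 7), with the result truncated from the low byte.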
19536
19537// Try to use a packed vector operation to handle i64 on 32-bit targets when
19538// AVX512DQ is enabled.
19539static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19540 const X86Subtarget &Subtarget) {
19541 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19542 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19543 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19544 Op.getOpcode() == ISD::UINT_TO_FP) &&
19545 "Unexpected opcode!");
19546 bool IsStrict = Op->isStrictFPOpcode();
19547 unsigned OpNo = IsStrict ? 1 : 0;
19548 SDValue Src = Op.getOperand(OpNo);
19549 MVT SrcVT = Src.getSimpleValueType();
19550 MVT VT = Op.getSimpleValueType();
19551
19552 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19553 (VT != MVT::f32 && VT != MVT::f64))
19554 return SDValue();
19555
19556 // Pack the i64 into a vector, do the operation and extract.
19557
19558 // Use 256 bits to ensure the result is 128 bits for the f32 case.
19559 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19560 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19561 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19562
19563 SDLoc dl(Op);
19564 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19565 if (IsStrict) {
19566 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19567 {Op.getOperand(0), InVec});
19568 SDValue Chain = CvtVec.getValue(1);
19569 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19570 DAG.getIntPtrConstant(0, dl));
19571 return DAG.getMergeValues({Value, Chain}, dl);
19572 }
19573
19574 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19575
19576 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19577 DAG.getIntPtrConstant(0, dl));
19578}
19579
19580static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19581 const X86Subtarget &Subtarget) {
19582 switch (Opcode) {
19583 case ISD::SINT_TO_FP:
19584 // TODO: Handle wider types with AVX/AVX512.
19585 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19586 return false;
19587 // CVTDQ2PS or (V)CVTDQ2PD
19588 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19589
19590 case ISD::UINT_TO_FP:
19591 // TODO: Handle wider types and i64 elements.
19592 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19593 return false;
19594 // VCVTUDQ2PS or VCVTUDQ2PD
19595 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19596
19597 default:
19598 return false;
19599 }
19600}
19601
19602/// Given a scalar cast operation that is extracted from a vector, try to
19603/// vectorize the cast op followed by extraction. This will avoid an expensive
19604/// round-trip between XMM and GPR.
19605static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19606 const X86Subtarget &Subtarget) {
19607 // TODO: This could be enhanced to handle smaller integer types by peeking
19608 // through an extend.
19609 SDValue Extract = Cast.getOperand(0);
19610 MVT DestVT = Cast.getSimpleValueType();
19611 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19612 !isa<ConstantSDNode>(Extract.getOperand(1)))
19613 return SDValue();
19614
19615 // See if we have a 128-bit vector cast op for this type of cast.
19616 SDValue VecOp = Extract.getOperand(0);
19617 MVT FromVT = VecOp.getSimpleValueType();
19618 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19619 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19620 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19621 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19622 return SDValue();
19623
19624 // If we are extracting from a non-zero element, first shuffle the source
19625 // vector to allow extracting from element zero.
19626 SDLoc DL(Cast);
19627 if (!isNullConstant(Extract.getOperand(1))) {
19628 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19629 Mask[0] = Extract.getConstantOperandVal(1);
19630 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19631 }
19632 // If the source vector is wider than 128-bits, extract the low part. Do not
19633 // create an unnecessarily wide vector cast op.
19634 if (FromVT != Vec128VT)
19635 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19636
19637 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19638 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19639 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19640 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19641 DAG.getIntPtrConstant(0, DL));
19642}
19643
19644/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19645/// try to vectorize the cast ops. This will avoid an expensive round-trip
19646/// between XMM and GPR.
19647static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19648 const X86Subtarget &Subtarget) {
19649 // TODO: Allow FP_TO_UINT.
19650 SDValue CastToInt = CastToFP.getOperand(0);
19651 MVT VT = CastToFP.getSimpleValueType();
19652 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19653 return SDValue();
19654
19655 MVT IntVT = CastToInt.getSimpleValueType();
19656 SDValue X = CastToInt.getOperand(0);
19657 MVT SrcVT = X.getSimpleValueType();
19658 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19659 return SDValue();
19660
19661 // See if we have 128-bit vector cast instructions for this type of cast.
19662 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19663 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19664 IntVT != MVT::i32)
19665 return SDValue();
19666
19667 unsigned SrcSize = SrcVT.getSizeInBits();
19668 unsigned IntSize = IntVT.getSizeInBits();
19669 unsigned VTSize = VT.getSizeInBits();
19670 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19671 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19672 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19673
19674 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19675 unsigned ToIntOpcode =
19676 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19677 unsigned ToFPOpcode =
19678 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19679
19680 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19681 //
19682 // We are not defining the high elements (for example, zero them) because
19683 // that could nullify any performance advantage that we hoped to gain from
19684 // this vector op hack. We do not expect any adverse effects (like denorm
19685 // penalties) with cast ops.
19686 SDLoc DL(CastToFP);
19687 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19688 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19689 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19690 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19691 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19692}
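// A standalone sketch (illustration only, not used by the lowering) of what
// the transform above amounts to for f32, written with baseline SSE2
// intrinsics; the helper name fpToIntToFPRef is hypothetical. The whole
// f32 -> i32 -> f32 round-trip stays in an XMM register instead of bouncing
// through a GPR. (Unlike the DAG above, _mm_set_ss zeroes the upper lanes.)
#include <immintrin.h>
static inline float fpToIntToFPRef(float X) {
  __m128 V = _mm_set_ss(X);        // scalar_to_vector
  __m128i I = _mm_cvttps_epi32(V); // cvttps2dq: vector fp_to_sint
  __m128 F = _mm_cvtepi32_ps(I);   // cvtdq2ps:  vector sint_to_fp
  return _mm_cvtss_f32(F);         // extract element 0
}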
19693
19694static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19695 const X86Subtarget &Subtarget) {
19696 SDLoc DL(Op);
19697 bool IsStrict = Op->isStrictFPOpcode();
19698 MVT VT = Op->getSimpleValueType(0);
19699 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19700
19701 if (Subtarget.hasDQI()) {
19702 assert(!Subtarget.hasVLX() && "Unexpected features");
19703
19704 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19705 Src.getSimpleValueType() == MVT::v4i64) &&
19706 "Unsupported custom type");
19707
19708 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19709 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19710 "Unexpected VT!");
19711 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19712
19713 // Need to concat with zero vector for strict fp to avoid spurious
19714 // exceptions.
19715 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19716 : DAG.getUNDEF(MVT::v8i64);
19717 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19718 DAG.getIntPtrConstant(0, DL));
19719 SDValue Res, Chain;
19720 if (IsStrict) {
19721 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19722 {Op->getOperand(0), Src});
19723 Chain = Res.getValue(1);
19724 } else {
19725 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19726 }
19727
19728 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19729 DAG.getIntPtrConstant(0, DL));
19730
19731 if (IsStrict)
19732 return DAG.getMergeValues({Res, Chain}, DL);
19733 return Res;
19734 }
19735
19736 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19737 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19738 if (VT != MVT::v4f32 || IsSigned)
19739 return SDValue();
19740
19741 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19742 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19743 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19744 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19745 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19746 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19747 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19748 SmallVector<SDValue, 4> SignCvts(4);
19749 SmallVector<SDValue, 4> Chains(4);
19750 for (int i = 0; i != 4; ++i) {
19751 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19752 DAG.getIntPtrConstant(i, DL));
19753 if (IsStrict) {
19754 SignCvts[i] =
19755 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19756 {Op.getOperand(0), Elt});
19757 Chains[i] = SignCvts[i].getValue(1);
19758 } else {
19759 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19760 }
19761 }
19762 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19763
19764 SDValue Slow, Chain;
19765 if (IsStrict) {
19766 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19767 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19768 {Chain, SignCvt, SignCvt});
19769 Chain = Slow.getValue(1);
19770 } else {
19771 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19772 }
19773
19774 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19775 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19776
19777 if (IsStrict)
19778 return DAG.getMergeValues({Cvt, Chain}, DL);
19779
19780 return Cvt;
19781}
19782
19783SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19784 SelectionDAG &DAG) const {
19785 bool IsStrict = Op->isStrictFPOpcode();
19786 unsigned OpNo = IsStrict ? 1 : 0;
19787 SDValue Src = Op.getOperand(OpNo);
19788 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19789 MVT SrcVT = Src.getSimpleValueType();
19790 MVT VT = Op.getSimpleValueType();
19791 SDLoc dl(Op);
19792
19793 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19794 return Extract;
19795
19796 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
19797 return R;
19798
19799 if (SrcVT.isVector()) {
19800 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19801 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19802 // source for strict FP.
19803 if (IsStrict)
19804 return DAG.getNode(
19805 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19806 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19807 DAG.getUNDEF(SrcVT))});
19808 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19809 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19810 DAG.getUNDEF(SrcVT)));
19811 }
19812 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19813 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19814
19815 return SDValue();
19816 }
19817
19818 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19819 "Unknown SINT_TO_FP to lower!");
19820
19821 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19822
19823 // These are really Legal; return the operand so the caller accepts it as
19824 // Legal.
19825 if (SrcVT == MVT::i32 && UseSSEReg)
19826 return Op;
19827 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19828 return Op;
19829
19830 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19831 return V;
19832
19833 // SSE doesn't have an i16 conversion so we need to promote.
19834 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19835 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19836 if (IsStrict)
19837 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19838 {Chain, Ext});
19839
19840 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19841 }
19842
19843 if (VT == MVT::f128)
19844 return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
19845
19846 SDValue ValueToStore = Src;
19847 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19848 // Bitcasting to f64 here allows us to do a single 64-bit store from
19849 // an SSE register, avoiding the store forwarding penalty that would come
19850 // with two 32-bit stores.
19851 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19852
19853 unsigned Size = SrcVT.getStoreSize();
19854 Align Alignment(Size);
19855 MachineFunction &MF = DAG.getMachineFunction();
19856 auto PtrVT = getPointerTy(MF.getDataLayout());
19857 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19858 MachinePointerInfo MPI =
19859 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19860 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19861 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19862 std::pair<SDValue, SDValue> Tmp =
19863 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19864
19865 if (IsStrict)
19866 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19867
19868 return Tmp.first;
19869}
19870
19871std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19872 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19873 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19874 // Build the FILD
19875 SDVTList Tys;
19876 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19877 if (useSSE)
19878 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19879 else
19880 Tys = DAG.getVTList(DstVT, MVT::Other);
19881
19882 SDValue FILDOps[] = {Chain, Pointer};
19883 SDValue Result =
19884 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19885 Alignment, MachineMemOperand::MOLoad);
19886 Chain = Result.getValue(1);
19887
19888 if (useSSE) {
19889 MachineFunction &MF = DAG.getMachineFunction();
19890 unsigned SSFISize = DstVT.getStoreSize();
19891 int SSFI =
19892 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19893 auto PtrVT = getPointerTy(MF.getDataLayout());
19894 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19895 Tys = DAG.getVTList(MVT::Other);
19896 SDValue FSTOps[] = {Chain, Result, StackSlot};
19897 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19898 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19899 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19900
19901 Chain =
19902 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19903 Result = DAG.getLoad(
19904 DstVT, DL, Chain, StackSlot,
19905 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19906 Chain = Result.getValue(1);
19907 }
19908
19909 return { Result, Chain };
19910}
19911
19912/// Horizontal vector math instructions may be slower than normal math with
19913/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19914/// implementation, and likely shuffle complexity of the alternate sequence.
19915static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19916 const X86Subtarget &Subtarget) {
19917 bool IsOptimizingSize = DAG.shouldOptForSize();
19918 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19919 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19920}
19921
19922/// 64-bit unsigned integer to double expansion.
19923static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
19924 const X86Subtarget &Subtarget) {
19925 // We can't use this algorithm for strict fp: when rounding toward negative
19926 // infinity it produces -0.0 instead of +0.0 for a zero input. The caller
19927 // falls back to Expand (when i64 is legal) or to FILD (in 32-bit mode).
19928 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19929 // This algorithm is not obvious. Here is what we're trying to output:
19930 /*
19931 movq %rax, %xmm0
19932 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19933 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19934 #ifdef __SSE3__
19935 haddpd %xmm0, %xmm0
19936 #else
19937 pshufd $0x4e, %xmm0, %xmm1
19938 addpd %xmm1, %xmm0
19939 #endif
19940 */
19941
19942 SDLoc dl(Op);
19943 LLVMContext *Context = DAG.getContext();
19944
19945 // Build some magic constants.
19946 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19947 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19948 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19949 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19950
19951 SmallVector<Constant*,2> CV1;
19952 CV1.push_back(
19953 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19954 APInt(64, 0x4330000000000000ULL))));
19955 CV1.push_back(
19956 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19957 APInt(64, 0x4530000000000000ULL))));
19958 Constant *C1 = ConstantVector::get(CV1);
19959 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19960
19961 // Load the 64-bit value into an XMM register.
19962 SDValue XR1 =
19963 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19964 SDValue CLod0 = DAG.getLoad(
19965 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19966 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19967 SDValue Unpck1 =
19968 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19969
19970 SDValue CLod1 = DAG.getLoad(
19971 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19972 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19973 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19974 // TODO: Are there any fast-math-flags to propagate here?
19975 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19976 SDValue Result;
19977
19978 if (Subtarget.hasSSE3() &&
19979 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19980 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19981 } else {
19982 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19983 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19984 }
19985 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19986 DAG.getIntPtrConstant(0, dl));
19987 return Result;
19988}
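// A scalar sketch (illustration only) of the arithmetic behind the two magic
// constants above, using the BitsToDouble helper this file already uses; the
// function name uint64ToDoubleRef is hypothetical. The 0x433... pattern is
// 2^52 and the 0x453... pattern is 2^84, so OR'ing the two 32-bit halves into
// those mantissas, subtracting the biases, and adding the parts reproduces the
// full u64 value with a single final rounding (the haddpd/addpd step).
static inline double uint64ToDoubleRef(uint64_t V) {
  const double TwoP52 = 4503599627370496.0;    // 2^52
  const double TwoP84 = TwoP52 * 4294967296.0; // 2^84, exact
  double DLo = BitsToDouble(0x4330000000000000ULL | (uint32_t)V) - TwoP52;
  double DHi = BitsToDouble(0x4530000000000000ULL | (V >> 32)) - TwoP84;
  return DLo + DHi; // DLo == low 32 bits, DHi == high 32 bits * 2^32, both exact
}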
19989
19990/// 32-bit unsigned integer to float expansion.
19991static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
19992 const X86Subtarget &Subtarget) {
19993 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19994 SDLoc dl(Op);
19995 // FP constant to bias correct the final result.
19996 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
19997 MVT::f64);
19998
19999 // Load the 32-bit value into an XMM register.
20000 SDValue Load =
20001 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20002
20003 // Zero out the upper parts of the register.
20004 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20005
20006 // Or the load with the bias.
20007 SDValue Or = DAG.getNode(
20008 ISD::OR, dl, MVT::v2i64,
20009 DAG.getBitcast(MVT::v2i64, Load),
20010 DAG.getBitcast(MVT::v2i64,
20011 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20012 Or =
20013 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20014 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20015
20016 if (Op.getNode()->isStrictFPOpcode()) {
20017 // Subtract the bias.
20018 // TODO: Are there any fast-math-flags to propagate here?
20019 SDValue Chain = Op.getOperand(0);
20020 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20021 {Chain, Or, Bias});
20022
20023 if (Op.getValueType() == Sub.getValueType())
20024 return Sub;
20025
20026 // Handle final rounding.
20027 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20028 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20029
20030 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20031 }
20032
20033 // Subtract the bias.
20034 // TODO: Are there any fast-math-flags to propagate here?
20035 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20036
20037 // Handle final rounding.
20038 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20039}
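// A scalar sketch (illustration only) of the bias trick above; the name
// uint32ToDoubleRef is hypothetical, BitsToDouble is the helper already used
// here. OR'ing the 32-bit value into the mantissa of 2^52 (bit pattern
// 0x4330000000000000) yields exactly 2^52 + N, and subtracting 2^52 leaves
// exactly (double)N, with no rounding anywhere.
static inline double uint32ToDoubleRef(uint32_t N) {
  double Biased = BitsToDouble(0x4330000000000000ULL | N); // == 2^52 + N
  return Biased - 4503599627370496.0;                      // subtract 2^52
}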
20040
20041static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20042 const X86Subtarget &Subtarget,
20043 const SDLoc &DL) {
20044 if (Op.getSimpleValueType() != MVT::v2f64)
20045 return SDValue();
20046
20047 bool IsStrict = Op->isStrictFPOpcode();
20048
20049 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20050 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20051
20052 if (Subtarget.hasAVX512()) {
20053 if (!Subtarget.hasVLX()) {
20054 // Let generic type legalization widen this.
20055 if (!IsStrict)
20056 return SDValue();
20057 // Otherwise pad the integer input with 0s and widen the operation.
20058 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20059 DAG.getConstant(0, DL, MVT::v2i32));
20060 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20061 {Op.getOperand(0), N0});
20062 SDValue Chain = Res.getValue(1);
20063 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20064 DAG.getIntPtrConstant(0, DL));
20065 return DAG.getMergeValues({Res, Chain}, DL);
20066 }
20067
20068 // Legalize to v4i32 type.
20069 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20070 DAG.getUNDEF(MVT::v2i32));
20071 if (IsStrict)
20072 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20073 {Op.getOperand(0), N0});
20074 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20075 }
20076
20077 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20078 // This gives us the floating point equivalent of 2^52 + the i32 integer
20079 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20080 // point leaving just our i32 integers in double format.
20081 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20082 SDValue VBias =
20083 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20084 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20085 DAG.getBitcast(MVT::v2i64, VBias));
20086 Or = DAG.getBitcast(MVT::v2f64, Or);
20087
20088 if (IsStrict)
20089 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20090 {Op.getOperand(0), Or, VBias});
20091 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20092}
20093
20094static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20095 const X86Subtarget &Subtarget) {
20096 SDLoc DL(Op);
20097 bool IsStrict = Op->isStrictFPOpcode();
20098 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20099 MVT VecIntVT = V.getSimpleValueType();
20100 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20101 "Unsupported custom type");
20102
20103 if (Subtarget.hasAVX512()) {
20104 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20105 assert(!Subtarget.hasVLX() && "Unexpected features");
20106 MVT VT = Op->getSimpleValueType(0);
20107
20108 // v8i32->v8f64 is legal with AVX512 so just return it.
20109 if (VT == MVT::v8f64)
20110 return Op;
20111
20112 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20113 "Unexpected VT!");
20114 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20115 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20116 // Need to concat with zero vector for strict fp to avoid spurious
20117 // exceptions.
20118 SDValue Tmp =
20119 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20120 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20121 DAG.getIntPtrConstant(0, DL));
20122 SDValue Res, Chain;
20123 if (IsStrict) {
20124 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20125 {Op->getOperand(0), V});
20126 Chain = Res.getValue(1);
20127 } else {
20128 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20129 }
20130
20131 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20132 DAG.getIntPtrConstant(0, DL));
20133
20134 if (IsStrict)
20135 return DAG.getMergeValues({Res, Chain}, DL);
20136 return Res;
20137 }
20138
20139 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20140 Op->getSimpleValueType(0) == MVT::v4f64) {
20141 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20142 Constant *Bias = ConstantFP::get(
20143 *DAG.getContext(),
20144 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20145 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20146 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20147 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20148 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20149 SDValue VBias = DAG.getMemIntrinsicNode(
20150 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20151 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20152 MachineMemOperand::MOLoad);
20153
20154 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20155 DAG.getBitcast(MVT::v4i64, VBias));
20156 Or = DAG.getBitcast(MVT::v4f64, Or);
20157
20158 if (IsStrict)
20159 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20160 {Op.getOperand(0), Or, VBias});
20161 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20162 }
20163
20164 // The algorithm is the following:
20165 // #ifdef __SSE4_1__
20166 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20167 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20168 // (uint4) 0x53000000, 0xaa);
20169 // #else
20170 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20171 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20172 // #endif
20173 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20174 // return (float4) lo + fhi;
20175
20176 bool Is128 = VecIntVT == MVT::v4i32;
20177 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20178 // If we convert to something other than the supported type, e.g., to v4f64,
20179 // abort early.
20180 if (VecFloatVT != Op->getSimpleValueType(0))
20181 return SDValue();
20182
20183 // In the #ifdef/#else code, we have in common:
20184 // - The vector of constants:
20185 // -- 0x4b000000
20186 // -- 0x53000000
20187 // - A shift:
20188 // -- v >> 16
20189
20190 // Create the splat vector for 0x4b000000.
20191 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20192 // Create the splat vector for 0x53000000.
20193 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20194
20195 // Create the right shift.
20196 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20197 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20198
20199 SDValue Low, High;
20200 if (Subtarget.hasSSE41()) {
20201 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20202 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20203 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20204 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20205 // Low will be bitcasted right away, so do not bother bitcasting back to its
20206 // original type.
20207 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20208 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20209 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20210 // (uint4) 0x53000000, 0xaa);
20211 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20212 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20213 // High will be bitcasted right away, so do not bother bitcasting back to
20214 // its original type.
20215 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20216 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20217 } else {
20218 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20219 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20220 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20221 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20222
20223 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20224 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20225 }
20226
20227 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20228 SDValue VecCstFSub = DAG.getConstantFP(
20229 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20230
20231 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20232 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20233 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20234 // enabled. See PR24512.
20235 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20236 // TODO: Are there any fast-math-flags to propagate here?
20237 // (float4) lo;
20238 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20239 // return (float4) lo + fhi;
20240 if (IsStrict) {
20241 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20242 {Op.getOperand(0), HighBitcast, VecCstFSub});
20243 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20244 {FHigh.getValue(1), LowBitcast, FHigh});
20245 }
20246
20247 SDValue FHigh =
20248 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20249 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20250}
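// One lane of the algorithm in the comment above, as a scalar sketch
// (illustration only). 0x4b000000 is the bit pattern of 2^23 and 0x53000000 of
// 2^39, so the ORs place the 16-bit halves directly into float mantissas; both
// bias subtractions are exact and only the final add rounds, to the nearest
// float of V. The names bitsToFloatRef and uint32ToFloatRef are hypothetical
// helpers, and <cstring> is included here only for this sketch.
#include <cstring>
static inline float bitsToFloatRef(uint32_t Bits) {
  float F;
  memcpy(&F, &Bits, sizeof(F)); // reinterpret the bit pattern as a float
  return F;
}
static inline float uint32ToFloatRef(uint32_t V) {
  float Lo = bitsToFloatRef((V & 0xffff) | 0x4b000000); // 2^23 + (V & 0xffff)
  float Hi = bitsToFloatRef((V >> 16) | 0x53000000);    // 2^39 + (V >> 16) * 2^16
  float FHi = Hi - (549755813888.0f + 8388608.0f);      // subtract 2^39 + 2^23, exact
  return Lo + FHi;                                      // single rounding at the end
}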
20251
20252static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20253 const X86Subtarget &Subtarget) {
20254 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20255 SDValue N0 = Op.getOperand(OpNo);
20256 MVT SrcVT = N0.getSimpleValueType();
20257 SDLoc dl(Op);
20258
20259 switch (SrcVT.SimpleTy) {
20260 default:
20261 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20262 case MVT::v2i32:
20263 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20264 case MVT::v4i32:
20265 case MVT::v8i32:
20266 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20267 case MVT::v2i64:
20268 case MVT::v4i64:
20269 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20270 }
20271}
20272
20273SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20274 SelectionDAG &DAG) const {
20275 bool IsStrict = Op->isStrictFPOpcode();
20276 unsigned OpNo = IsStrict ? 1 : 0;
20277 SDValue Src = Op.getOperand(OpNo);
20278 SDLoc dl(Op);
20279 auto PtrVT = getPointerTy(DAG.getDataLayout());
20280 MVT SrcVT = Src.getSimpleValueType();
20281 MVT DstVT = Op->getSimpleValueType(0);
20282 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20283
20284 if (DstVT == MVT::f128)
20285 return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
20286
20287 if (DstVT.isVector())
20288 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20289
20290 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20291 return Extract;
20292
20293 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20294 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20295 // Conversions from unsigned i32 to f32/f64 are legal,
20296 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20297 return Op;
20298 }
20299
20300 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20301 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20302 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20303 if (IsStrict)
20304 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20305 {Chain, Src});
20306 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20307 }
20308
20309 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20310 return V;
20311
20312 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20313 // infinity. It produces -0.0, so disable under strictfp.
20314 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20315 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20316 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20317 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20318 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20319 (DstVT == MVT::f32 || DstVT == MVT::f64))
20320 return SDValue();
20321
20322 // Make a 64-bit buffer, and use it to build an FILD.
20323 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20324 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20325 Align SlotAlign(8);
20326 MachinePointerInfo MPI =
20327 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20328 if (SrcVT == MVT::i32) {
20329 SDValue OffsetSlot =
20330 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20331 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20332 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20333 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20334 std::pair<SDValue, SDValue> Tmp =
20335 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20336 if (IsStrict)
20337 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20338
20339 return Tmp.first;
20340 }
20341
20342 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20343 SDValue ValueToStore = Src;
20344 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20345 // Bitcasting to f64 here allows us to do a single 64-bit store from
20346 // an SSE register, avoiding the store forwarding penalty that would come
20347 // with two 32-bit stores.
20348 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20349 }
20350 SDValue Store =
20351 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20352 // For i64 source, we need to add the appropriate power of 2 if the input
20353 // was negative. We must be careful to do the computation in x87 extended
20354 // precision, not in SSE.
20355 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20356 SDValue Ops[] = { Store, StackSlot };
20357 SDValue Fild =
20358 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20359 SlotAlign, MachineMemOperand::MOLoad);
20360 Chain = Fild.getValue(1);
20361
20362
20363 // Check whether the sign bit is set.
20364 SDValue SignSet = DAG.getSetCC(
20365 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20366 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20367
20368 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20369 APInt FF(64, 0x5F80000000000000ULL);
20370 SDValue FudgePtr = DAG.getConstantPool(
20371 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20372 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20373
20374 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20375 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20376 SDValue Four = DAG.getIntPtrConstant(4, dl);
20377 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20378 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20379
20380 // Load the value out, extending it from f32 to f80.
20381 SDValue Fudge = DAG.getExtLoad(
20382 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20383 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20384 CPAlignment);
20385 Chain = Fudge.getValue(1);
20386 // Extend everything to 80 bits to force it to be done on x87.
20387 // TODO: Are there any fast-math-flags to propagate here?
20388 if (IsStrict) {
20389 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20390 {Chain, Fild, Fudge});
20391 // STRICT_FP_ROUND can't handle equal types.
20392 if (DstVT == MVT::f80)
20393 return Add;
20394 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20395 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20396 }
20397 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20398 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20399 DAG.getIntPtrConstant(0, dl));
20400}
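// A scalar sketch (illustration only) of the correction applied through the
// (FF, 0) constant-pool pair above; the name uint64ViaSignedRef is
// hypothetical. FILD reads the stack slot as a *signed* i64, so when the
// unsigned source has its top bit set the loaded value is Src - 2^64, and the
// fudge factor (0x5F800000 is 2^64 as an f32 bit pattern) adds 2^64 back.
// long double stands in for x87 f80; two's complement casts are assumed.
static inline long double uint64ViaSignedRef(uint64_t Src) {
  long double Fild = (long double)(int64_t)Src;                    // what FILD produces
  long double Fudge = (Src >> 63) ? 18446744073709551616.0L : 0.0L; // 2^64 or 0.0
  return Fild + Fudge;                                             // x87 FADD at f80
}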
20401
20402// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20403// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20404// just return an SDValue().
20405// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20406// to i16, i32 or i64, and we lower it to a legal sequence and return the
20407// result.
20408SDValue
20409X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20410 bool IsSigned, SDValue &Chain) const {
20411 bool IsStrict = Op->isStrictFPOpcode();
20412 SDLoc DL(Op);
20413
20414 EVT DstTy = Op.getValueType();
20415 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20416 EVT TheVT = Value.getValueType();
20417 auto PtrVT = getPointerTy(DAG.getDataLayout());
20418
20419 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20420 // f16 must be promoted before using the lowering in this routine.
20421 // fp128 does not use this lowering.
20422 return SDValue();
20423 }
20424
20425 // If using FIST to compute an unsigned i64, we'll need some fixup
20426 // to handle values above the maximum signed i64. A FIST is always
20427 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20428 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20429
20430 // FIXME: This does not generate an invalid exception if the input does not
20431 // fit in i32. PR44019
20432 if (!IsSigned && DstTy != MVT::i64) {
20433 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20434 // The low 32 bits of the fist result will have the correct uint32 result.
20435 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20436 DstTy = MVT::i64;
20437 }
20438
20439 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20440 DstTy.getSimpleVT() >= MVT::i16 &&
20441 "Unknown FP_TO_INT to lower!");
20442
20443 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20444 // stack slot.
20445 MachineFunction &MF = DAG.getMachineFunction();
20446 unsigned MemSize = DstTy.getStoreSize();
20447 int SSFI =
20448 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20449 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20450
20451 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20452
20453 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20454
20455 if (UnsignedFixup) {
20456 //
20457 // Conversion to unsigned i64 is implemented with a select,
20458 // depending on whether the source value fits in the range
20459 // of a signed i64. Let Thresh be the FP equivalent of
20460 // 0x8000000000000000ULL.
20461 //
20462 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20463 // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
20464 // FistSrc = (Value - FltOfs);
20465 // Fist-to-mem64 FistSrc
20466 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20467 // to XOR'ing the high 32 bits with Adjust.
20468 //
20469 // Being a power of 2, Thresh is exactly representable in all FP formats.
20470 // For X87 we'd like to use the smallest FP type for this constant, but
20471 // for DAG type consistency we have to match the FP operand type.
20472
20473 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20474 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20475 bool LosesInfo = false;
20476 if (TheVT == MVT::f64)
20477 // The rounding mode is irrelevant as the conversion should be exact.
20478 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20479 &LosesInfo);
20480 else if (TheVT == MVT::f80)
20481 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20482 APFloat::rmNearestTiesToEven, &LosesInfo);
20483
20484 assert(Status == APFloat::opOK && !LosesInfo &&
20485 "FP conversion should have been exact");
20486
20487 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20488
20489 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20490 *DAG.getContext(), TheVT);
20491 SDValue Cmp;
20492 if (IsStrict) {
20493 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20494 /*IsSignaling*/ true);
20495 Chain = Cmp.getValue(1);
20496 } else {
20497 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20498 }
20499
20500 // Our preferred lowering of
20501 //
20502 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20503 //
20504 // is
20505 //
20506 // (Value >= Thresh) << 63
20507 //
20508 // but since we can get here after LegalOperations, DAGCombine might do the
20509 // wrong thing if we create a select. So, directly create the preferred
20510 // version.
20511 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20512 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20513 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20514
20515 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20516 DAG.getConstantFP(0.0, DL, TheVT));
20517
20518 if (IsStrict) {
20519 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20520 { Chain, Value, FltOfs });
20521 Chain = Value.getValue(1);
20522 } else
20523 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20524 }
20525
20526 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20527
20528 // FIXME This causes a redundant load/store if the SSE-class value is already
20529 // in memory, such as if it is on the callstack.
20530 if (isScalarFPTypeInSSEReg(TheVT)) {
20531 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20532 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20533 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20534 SDValue Ops[] = { Chain, StackSlot };
20535
20536 unsigned FLDSize = TheVT.getStoreSize();
20537 assert(FLDSize <= MemSize && "Stack slot not big enough");
20538 MachineMemOperand *MMO = MF.getMachineMemOperand(
20539 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20540 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20541 Chain = Value.getValue(1);
20542 }
20543
20544 // Build the FP_TO_INT*_IN_MEM
20545 MachineMemOperand *MMO = MF.getMachineMemOperand(
20546 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20547 SDValue Ops[] = { Chain, Value, StackSlot };
20548 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20549 DAG.getVTList(MVT::Other),
20550 Ops, DstTy, MMO);
20551
20552 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20553 Chain = Res.getValue(1);
20554
20555 // If we need an unsigned fixup, XOR the result with adjust.
20556 if (UnsignedFixup)
20557 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20558
20559 return Res;
20560}
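// A scalar sketch (illustration only) of the UnsignedFixup path above; the
// name fpToUint64Ref is hypothetical and the input is assumed to lie in
// [0, 2^64). Sources at or above 2^63 are shifted down into signed range
// before the signed conversion, and the top bit is XOR'ed back into the
// integer result, exactly how Adjust and FltOfs are used above.
static inline uint64_t fpToUint64Ref(double Value) {
  const double Thresh = 9223372036854775808.0;    // 2^63, exactly representable
  bool Big = Value >= Thresh;                     // the SETGE compare
  double FistSrc = Big ? Value - Thresh : Value;  // Value - FltOfs (exact)
  uint64_t Res = (uint64_t)(int64_t)FistSrc;      // the signed FIST conversion
  return Res ^ (Big ? 0x8000000000000000ULL : 0); // XOR in Adjust
}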
20561
20562static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20563 const X86Subtarget &Subtarget) {
20564 MVT VT = Op.getSimpleValueType();
20565 SDValue In = Op.getOperand(0);
20566 MVT InVT = In.getSimpleValueType();
20567 SDLoc dl(Op);
20568 unsigned Opc = Op.getOpcode();
20569
20570 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20571 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20572 "Unexpected extension opcode");
20573 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20574 "Expected same number of elements");
20575 assert((VT.getVectorElementType() == MVT::i16 ||
20576 VT.getVectorElementType() == MVT::i32 ||
20577 VT.getVectorElementType() == MVT::i64) &&
20578 "Unexpected element type");
20579 assert((InVT.getVectorElementType() == MVT::i8 ||
20580 InVT.getVectorElementType() == MVT::i16 ||
20581 InVT.getVectorElementType() == MVT::i32) &&
20582 "Unexpected element type");
20583
20584 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20585
20586 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20587 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20588 return splitVectorIntUnary(Op, DAG);
20589 }
20590
20591 if (Subtarget.hasInt256())
20592 return Op;
20593
20594 // Optimize vectors in AVX mode:
20595 //
20596 // v8i16 -> v8i32
20597 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20598 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20599 // Concat upper and lower parts.
20600 //
20601 // v4i32 -> v4i64
20602 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20603 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20604 // Concat upper and lower parts.
20605 //
20606 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20607 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20608
20609 // Short-circuit if we can determine that each 128-bit half is the same value.
20610 // Otherwise, this is difficult to match and optimize.
20611 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20612 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20613 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20614
20615 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20616 SDValue Undef = DAG.getUNDEF(InVT);
20617 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20618 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20619 OpHi = DAG.getBitcast(HalfVT, OpHi);
20620
20621 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20622}
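A rough standalone sketch of the v8i16 -> v8i32 zero-extend split described in the comments above, written with intrinsics for an AVX1-only target (the helper name is illustrative, not part of the lowering code):

#include <immintrin.h>

// Zero-extend v8i16 -> v8i32 without AVX2: PMOVZXWD for the low half,
// PUNPCKHWD against zero for the high half, then concatenate the two
// 128-bit parts into a 256-bit result.
static __m256i zext_v8i16_to_v8i32(__m128i v) {
  __m128i lo = _mm_cvtepu16_epi32(v);                      // low 4 x u16 -> u32
  __m128i hi = _mm_unpackhi_epi16(v, _mm_setzero_si128()); // high 4 x u16 -> u32
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); // concat parts
}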
20623
20624// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20625static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20626 const SDLoc &dl, SelectionDAG &DAG) {
20627 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20628 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20629 DAG.getIntPtrConstant(0, dl));
20630 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20631 DAG.getIntPtrConstant(8, dl));
20632 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20633 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20634 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20636}
20637
20638static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20639 const X86Subtarget &Subtarget,
20640 SelectionDAG &DAG) {
20641 MVT VT = Op->getSimpleValueType(0);
20642 SDValue In = Op->getOperand(0);
20643 MVT InVT = In.getSimpleValueType();
20644 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20645 SDLoc DL(Op);
20646 unsigned NumElts = VT.getVectorNumElements();
20647
20648 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20649 // avoids a constant pool load.
20650 if (VT.getVectorElementType() != MVT::i8) {
20651 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20652 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20653 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20654 }
20655
20656 // Extend VT if BWI is not supported.
20657 MVT ExtVT = VT;
20658 if (!Subtarget.hasBWI()) {
20659 // If v16i32 is to be avoided, we'll need to split and concatenate.
20660 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20661 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20662
20663 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20664 }
20665
20666 // Widen to 512-bits if VLX is not supported.
20667 MVT WideVT = ExtVT;
20668 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20669 NumElts *= 512 / ExtVT.getSizeInBits();
20670 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20671 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20672 In, DAG.getIntPtrConstant(0, DL));
20673 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20674 NumElts);
20675 }
20676
20677 SDValue One = DAG.getConstant(1, DL, WideVT);
20678 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20679
20680 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20681
20682 // Truncate if we had to extend above.
20683 if (VT != ExtVT) {
20684 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20685 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20686 }
20687
20688 // Extract back to 128/256-bit if we widened.
20689 if (WideVT != VT)
20690 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20691 DAG.getIntPtrConstant(0, DL));
20692
20693 return SelectedVal;
20694}
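A minimal scalar sketch of the sign_extend-plus-shift trick used by the non-vXi8 path above (the function name is hypothetical; the vector lowering does the same thing per lane):

#include <cstdint>

// zext(i1) lowered as (sign_extend >> (bits - 1)): the sign extension yields
// 0 or all-ones, and the logical shift leaves 0 or 1, with no constant-one
// vector needed from the constant pool.
static uint32_t zextMaskBit(bool m) {
  int32_t wide = m ? -1 : 0;                // ISD::SIGN_EXTEND of the i1
  return static_cast<uint32_t>(wide) >> 31; // ISD::SRL by (scalar bits - 1)
}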
20695
20696static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20697 SelectionDAG &DAG) {
20698 SDValue In = Op.getOperand(0);
20699 MVT SVT = In.getSimpleValueType();
20700
20701 if (SVT.getVectorElementType() == MVT::i1)
20702 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20703
20704 assert(Subtarget.hasAVX() && "Expected AVX support");
20705 return LowerAVXExtend(Op, DAG, Subtarget);
20706}
20707
20708/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20709/// It makes use of the fact that vectors with enough leading sign/zero bits
20710/// prevent the PACKSS/PACKUS from saturating the results.
20711/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20712/// within each 128-bit lane.
20713static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20714 const SDLoc &DL, SelectionDAG &DAG,
20715 const X86Subtarget &Subtarget) {
20716 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20717 "Unexpected PACK opcode");
20718 assert(DstVT.isVector() && "VT not a vector?");
20719
20720 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20721 if (!Subtarget.hasSSE2())
20722 return SDValue();
20723
20724 EVT SrcVT = In.getValueType();
20725
20726 // No truncation required; we might get here due to recursive calls.
20727 if (SrcVT == DstVT)
20728 return In;
20729
20730 // We only support vector truncation to 64 bits or greater from a
20731 // 128-bit or greater source.
20732 unsigned DstSizeInBits = DstVT.getSizeInBits();
20733 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20734 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20735 return SDValue();
20736
20737 unsigned NumElems = SrcVT.getVectorNumElements();
20738 if (!isPowerOf2_32(NumElems))
20739 return SDValue();
20740
20741 LLVMContext &Ctx = *DAG.getContext();
20742 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20743 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20744
20745 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20746
20747 // Pack to the largest type possible:
20748 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20749 EVT InVT = MVT::i16, OutVT = MVT::i8;
20750 if (SrcVT.getScalarSizeInBits() > 16 &&
20751 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20752 InVT = MVT::i32;
20753 OutVT = MVT::i16;
20754 }
20755
20756 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20757 if (SrcVT.is128BitVector()) {
20758 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20759 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20760 In = DAG.getBitcast(InVT, In);
20761 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
20762 Res = extractSubVector(Res, 0, DAG, DL, 64);
20763 return DAG.getBitcast(DstVT, Res);
20764 }
20765
20766 // Split lower/upper subvectors.
20767 SDValue Lo, Hi;
20768 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20769
20770 unsigned SubSizeInBits = SrcSizeInBits / 2;
20771 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20772 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20773
20774 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20775 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20776 Lo = DAG.getBitcast(InVT, Lo);
20777 Hi = DAG.getBitcast(InVT, Hi);
20778 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20779 return DAG.getBitcast(DstVT, Res);
20780 }
20781
20782 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20783 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20784 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20785 Lo = DAG.getBitcast(InVT, Lo);
20786 Hi = DAG.getBitcast(InVT, Hi);
20787 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20788
20789 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20790 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20791 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20792 SmallVector<int, 64> Mask;
20793 int Scale = 64 / OutVT.getScalarSizeInBits();
20794 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20795 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20796
20797 if (DstVT.is256BitVector())
20798 return DAG.getBitcast(DstVT, Res);
20799
20800 // If 512bit -> 128bit truncate another stage.
20801 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20802 Res = DAG.getBitcast(PackedVT, Res);
20803 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20804 }
20805
20806 // Recursively pack lower/upper subvectors, concat result and pack again.
20807 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20808 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20809 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
20810 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
20811
20812 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20813 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20814 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20815}
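A small intrinsics sketch of the base 128-bit case above, assuming the input already has its upper bits clear so PACKUS saturation never fires (names are illustrative):

#include <immintrin.h>

// v8i16 -> v8i8 truncation with PACKUSWB; the truncated bytes land in the
// low 64 bits of the result, matching the extractSubVector(Res, 0, ..., 64) step.
static __m128i trunc_v8i16_to_v8i8(__m128i x) {
  return _mm_packus_epi16(x, _mm_setzero_si128());
}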
20816
20817static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20818 const X86Subtarget &Subtarget) {
20819
20820 SDLoc DL(Op);
20821 MVT VT = Op.getSimpleValueType();
20822 SDValue In = Op.getOperand(0);
20823 MVT InVT = In.getSimpleValueType();
20824
20825 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20826
20827 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20828 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20829 if (InVT.getScalarSizeInBits() <= 16) {
20830 if (Subtarget.hasBWI()) {
20831 // legal, will go to VPMOVB2M, VPMOVW2M
20832 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20833 // We need to shift to get the lsb into the sign position.
20834 // Shifting packed bytes is not supported natively, so bitcast to words.
20835 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20836 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20837 DAG.getBitcast(ExtVT, In),
20838 DAG.getConstant(ShiftInx, DL, ExtVT));
20839 In = DAG.getBitcast(InVT, In);
20840 }
20841 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20842 In, ISD::SETGT);
20843 }
20844 // Use TESTD/Q, extending the vector to packed dword/qword.
20845 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20846 "Unexpected vector type.");
20847 unsigned NumElts = InVT.getVectorNumElements();
20848 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20849 // We need to change to a wider element type that we have support for.
20850 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20851 // For 16 element vectors we extend to v16i32 unless we are explicitly
20852 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20853 // we need to split into two 8 element vectors which we can extend to v8i32,
20854 // truncate and concat the results. There's an additional complication if
20855 // the original type is v16i8. In that case we can't split the v16i8
20856 // directly, so we need to shuffle high elements to low and use
20857 // sign_extend_vector_inreg.
20858 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20859 SDValue Lo, Hi;
20860 if (InVT == MVT::v16i8) {
20861 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20862 Hi = DAG.getVectorShuffle(
20863 InVT, DL, In, In,
20864 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20865 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20866 } else {
20867 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20868 Lo = extract128BitVector(In, 0, DAG, DL);
20869 Hi = extract128BitVector(In, 8, DAG, DL);
20870 }
20871 // We're split now, just emit two truncates and a concat. The two
20872 // truncates will trigger legalization to come back to this function.
20873 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20874 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20875 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20876 }
20877 // We either have 8 elements or we're allowed to use 512-bit vectors.
20878 // If we have VLX, we want to use the narrowest vector that can get the
20879 // job done so we use vXi32.
20880 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20881 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20882 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20883 InVT = ExtVT;
20884 ShiftInx = InVT.getScalarSizeInBits() - 1;
20885 }
20886
20887 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20888 // We need to shift to get the lsb into the sign position.
20889 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20890 DAG.getConstant(ShiftInx, DL, InVT));
20891 }
20892 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20893 if (Subtarget.hasDQI())
20894 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20895 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20896}
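A scalar sketch of the shift-lsb-to-msb trick above (hypothetical helper; VPMOVW2M, i.e. the SETGT(0, x) pattern, performs the same sign test per 16-bit lane):

#include <cstdint>

static bool truncWordToI1(uint16_t v) {
  uint16_t shifted = static_cast<uint16_t>(v << 15); // move the lsb into the sign bit
  return static_cast<int16_t>(shifted) < 0;          // sign set <=> original lsb was 1
}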
20897
20898SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20899 SDLoc DL(Op);
20900 MVT VT = Op.getSimpleValueType();
20901 SDValue In = Op.getOperand(0);
20902 MVT InVT = In.getSimpleValueType();
20903 unsigned InNumEltBits = InVT.getScalarSizeInBits();
20904
20905 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20906 "Invalid TRUNCATE operation");
20907
20908 // If we're called by the type legalizer, handle a few cases.
20909 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20910 if (!TLI.isTypeLegal(InVT)) {
20911 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20912 VT.is128BitVector()) {
20913 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20914 "Unexpected subtarget!");
20915 // The default behavior is to truncate one step, concatenate, and then
20916 // truncate the remainder. We'd rather produce two 64-bit results and
20917 // concatenate those.
20918 SDValue Lo, Hi;
20919 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20920
20921 EVT LoVT, HiVT;
20922 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20923
20924 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20925 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20926 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20927 }
20928
20929 // Otherwise let default legalization handle it.
20930 return SDValue();
20931 }
20932
20933 if (VT.getVectorElementType() == MVT::i1)
20934 return LowerTruncateVecI1(Op, DAG, Subtarget);
20935
20936 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20937 if (Subtarget.hasAVX512()) {
20938 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20939 assert(VT == MVT::v32i8 && "Unexpected VT!");
20940 return splitVectorIntUnary(Op, DAG);
20941 }
20942
20943 // Word to byte only under BWI. Otherwise we have to promote to v16i32
20944 // and then truncate that. But we should only do that if we haven't been
20945 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20946 // handled by isel patterns.
20947 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20948 Subtarget.canExtendTo512DQ())
20949 return Op;
20950 }
20951
20952 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
20953 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20954
20955 // Truncate with PACKUS if we are truncating a vector with leading zero bits
20956 // that extend all the way to the packed/truncated value.
20957 // Pre-SSE41 we can only use PACKUSWB.
20958 KnownBits Known = DAG.computeKnownBits(In);
20959 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
20960 if (SDValue V =
20961 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
20962 return V;
20963
20964 // Truncate with PACKSS if we are truncating a vector with sign-bits that
20965 // extend all the way to the packed/truncated value.
20966 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
20967 if (SDValue V =
20968 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
20969 return V;
20970
20971 // Handle truncation of V256 to V128 using shuffles.
20972 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20973
20974 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20975 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20976 if (Subtarget.hasInt256()) {
20977 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20978 In = DAG.getBitcast(MVT::v8i32, In);
20979 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20981 DAG.getIntPtrConstant(0, DL));
20982 }
20983
20984 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20985 DAG.getIntPtrConstant(0, DL));
20986 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20987 DAG.getIntPtrConstant(2, DL));
20988 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
20989 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
20990 static const int ShufMask[] = {0, 2, 4, 6};
20991 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
20992 }
20993
20994 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20995 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20996 if (Subtarget.hasInt256()) {
20997 In = DAG.getBitcast(MVT::v32i8, In);
20998
20999 // The PSHUFB mask:
21000 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21001 -1, -1, -1, -1, -1, -1, -1, -1,
21002 16, 17, 20, 21, 24, 25, 28, 29,
21003 -1, -1, -1, -1, -1, -1, -1, -1 };
21004 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21005 In = DAG.getBitcast(MVT::v4i64, In);
21006
21007 static const int ShufMask2[] = {0, 2, -1, -1};
21008 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21009 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21010 DAG.getIntPtrConstant(0, DL));
21011 return DAG.getBitcast(VT, In);
21012 }
21013
21014 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21015 DAG.getIntPtrConstant(0, DL));
21016
21017 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21018 DAG.getIntPtrConstant(4, DL));
21019
21020 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
21021 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
21022
21023 // The PSHUFB mask:
21024 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21025 -1, -1, -1, -1, -1, -1, -1, -1};
21026
21027 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21028 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21029
21030 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21031 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21032
21033 // The MOVLHPS Mask:
21034 static const int ShufMask2[] = {0, 1, 4, 5};
21035 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21036 return DAG.getBitcast(MVT::v8i16, res);
21037 }
21038
21039 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21040 // Use an AND to zero upper bits for PACKUS.
21041 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21042
21043 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21044 DAG.getIntPtrConstant(0, DL));
21045 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21046 DAG.getIntPtrConstant(8, DL));
21047 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21048 }
21049
21050 llvm_unreachable("All 256->128 cases should have been handled above!");
21051}
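For the AVX2 v4i64 -> v4i32 path of LowerTRUNCATE above, a hedged intrinsics sketch of the VPERMD shuffle-and-extract (the function name and the zero padding of the unused index lanes are editorial choices):

#include <immintrin.h>

static __m128i trunc_v4i64_to_v4i32(__m256i x) {
  // Gather the low dword of each qword: indices {0, 2, 4, 6} when the input
  // is viewed as 8 x i32; the remaining index lanes are don't-care.
  const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(x, idx));
}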
21052
21053SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21054 bool IsStrict = Op->isStrictFPOpcode();
21055 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21056 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21057 MVT VT = Op->getSimpleValueType(0);
21058 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21059 MVT SrcVT = Src.getSimpleValueType();
21060 SDLoc dl(Op);
21061
21062 if (VT.isVector()) {
21063 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21064 MVT ResVT = MVT::v4i32;
21065 MVT TruncVT = MVT::v4i1;
21066 unsigned Opc;
21067 if (IsStrict)
21068 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21069 else
21070 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21071
21072 if (!IsSigned && !Subtarget.hasVLX()) {
21073 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21074 // Widen to 512-bits.
21075 ResVT = MVT::v8i32;
21076 TruncVT = MVT::v8i1;
21077 Opc = Op.getOpcode();
21078 // Need to concat with zero vector for strict fp to avoid spurious
21079 // exceptions.
21080 // TODO: Should we just do this for non-strict as well?
21081 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21082 : DAG.getUNDEF(MVT::v8f64);
21083 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21084 DAG.getIntPtrConstant(0, dl));
21085 }
21086 SDValue Res, Chain;
21087 if (IsStrict) {
21088 Res =
21089 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21090 Chain = Res.getValue(1);
21091 } else {
21092 Res = DAG.getNode(Opc, dl, ResVT, Src);
21093 }
21094
21095 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21096 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21097 DAG.getIntPtrConstant(0, dl));
21098 if (IsStrict)
21099 return DAG.getMergeValues({Res, Chain}, dl);
21100 return Res;
21101 }
21102
21103 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21104 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21105 assert(!IsSigned && "Expected unsigned conversion!");
21106 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21107 return Op;
21108 }
21109
21110 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21111 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21112 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
21113 assert(!IsSigned && "Expected unsigned conversion!");
21114 assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
21115 "Unexpected features!");
21116 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21117 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21118 // Need to concat with zero vector for strict fp to avoid spurious
21119 // exceptions.
21120 // TODO: Should we just do this for non-strict as well?
21121 SDValue Tmp =
21122 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21123 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21124 DAG.getIntPtrConstant(0, dl));
21125
21126 SDValue Res, Chain;
21127 if (IsStrict) {
21128 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21129 {Op->getOperand(0), Src});
21130 Chain = Res.getValue(1);
21131 } else {
21132 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21133 }
21134
21135 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21136 DAG.getIntPtrConstant(0, dl));
21137
21138 if (IsStrict)
21139 return DAG.getMergeValues({Res, Chain}, dl);
21140 return Res;
21141 }
21142
21143 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21144 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21145 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
21146 assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
21147 !Subtarget.hasVLX() && "Unexpected features!");
21148 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21149 // Need to concat with zero vector for strict fp to avoid spurious
21150 // exceptions.
21151 // TODO: Should we just do this for non-strict as well?
21152 SDValue Tmp =
21153 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21154 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21155 DAG.getIntPtrConstant(0, dl));
21156
21157 SDValue Res, Chain;
21158 if (IsStrict) {
21159 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21160 {Op->getOperand(0), Src});
21161 Chain = Res.getValue(1);
21162 } else {
21163 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21164 }
21165
21166 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21167 DAG.getIntPtrConstant(0, dl));
21168
21169 if (IsStrict)
21170 return DAG.getMergeValues({Res, Chain}, dl);
21171 return Res;
21172 }
21173
21174 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21175 if (!Subtarget.hasVLX()) {
21176 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21177 // legalizer and then widened again by vector op legalization.
21178 if (!IsStrict)
21179 return SDValue();
21180
21181 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21182 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21183 {Src, Zero, Zero, Zero});
21184 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21185 {Op->getOperand(0), Tmp});
21186 SDValue Chain = Tmp.getValue(1);
21187 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21188 DAG.getIntPtrConstant(0, dl));
21189 if (IsStrict)
21190 return DAG.getMergeValues({Tmp, Chain}, dl);
21191 return Tmp;
21192 }
21193
21194 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21195 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21196 DAG.getUNDEF(MVT::v2f32));
21197 if (IsStrict) {
21198 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21199 : X86ISD::STRICT_CVTTP2UI;
21200 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21201 }
21202 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21203 return DAG.getNode(Opc, dl, VT, Tmp);
21204 }
21205
21206 return SDValue();
21207 }
21208
21209 assert(!VT.isVector());
21210
21211 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21212
21213 if (!IsSigned && UseSSEReg) {
21214 // Conversions from f32/f64 with AVX512 should be legal.
21215 if (Subtarget.hasAVX512())
21216 return Op;
21217
21218 // Use default expansion for i64.
21219 if (VT == MVT::i64)
21220 return SDValue();
21221
21222 assert(VT == MVT::i32 && "Unexpected VT!");
21223
21224 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21225 // FIXME: This does not generate an invalid exception if the input does not
21226 // fit in i32. PR44019
21227 if (Subtarget.is64Bit()) {
21228 SDValue Res, Chain;
21229 if (IsStrict) {
21230 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21231 { Op.getOperand(0), Src });
21232 Chain = Res.getValue(1);
21233 } else
21234 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21235
21236 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21237 if (IsStrict)
21238 return DAG.getMergeValues({ Res, Chain }, dl);
21239 return Res;
21240 }
21241
21242 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21243 // use fisttp which will be handled later.
21244 if (!Subtarget.hasSSE3())
21245 return SDValue();
21246 }
21247
21248 // Promote i16 to i32 if we can use an SSE operation or the type is f128.
21249 // FIXME: This does not generate an invalid exception if the input does not
21250 // fit in i16. PR44019
21251 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21252 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21253 SDValue Res, Chain;
21254 if (IsStrict) {
21255 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21256 { Op.getOperand(0), Src });
21257 Chain = Res.getValue(1);
21258 } else
21259 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21260
21261 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21262 if (IsStrict)
21263 return DAG.getMergeValues({ Res, Chain }, dl);
21264 return Res;
21265 }
21266
21267 // If this is a FP_TO_SINT using SSEReg we're done.
21268 if (UseSSEReg && IsSigned)
21269 return Op;
21270
21271 // fp128 needs to use a libcall.
21272 if (SrcVT == MVT::f128) {
21273 RTLIB::Libcall LC;
21274 if (IsSigned)
21275 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21276 else
21277 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21278
21279 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21280 MakeLibCallOptions CallOptions;
21281 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21282 SDLoc(Op), Chain);
21283
21284 if (IsStrict)
21285 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21286
21287 return Tmp.first;
21288 }
21289
21290 // Fall back to X87.
21291 SDValue Chain;
21292 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21293 if (IsStrict)
21294 return DAG.getMergeValues({V, Chain}, dl);
21295 return V;
21296 }
21297
21298 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21299}
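A scalar sketch of the i16 promotion path above, assuming SSE is available (the helper name is illustrative; as the FIXME notes, this does not raise an invalid exception when the value does not fit in i16):

#include <immintrin.h>
#include <cstdint>

static int16_t fpToSI16(float f) {
  int32_t wide = _mm_cvttss_si32(_mm_set_ss(f)); // FP_TO_SINT to MVT::i32 (CVTTSS2SI)
  return static_cast<int16_t>(wide);             // ISD::TRUNCATE back to i16
}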
21300
21301SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21302 SelectionDAG &DAG) const {
21303 SDValue Src = Op.getOperand(0);
21304 MVT SrcVT = Src.getSimpleValueType();
21305
21306 // If the source is in an SSE register, the node is Legal.
21307 if (isScalarFPTypeInSSEReg(SrcVT))
21308 return Op;
21309
21310 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21311}
21312
21313SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21314 SelectionDAG &DAG) const {
21315 EVT DstVT = N->getValueType(0);
21316 SDValue Src = N->getOperand(0);
21317 EVT SrcVT = Src.getValueType();
21318
21319 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21320 // f16 must be promoted before using the lowering in this routine.
21321 // fp128 does not use this lowering.
21322 return SDValue();
21323 }
21324
21325 SDLoc DL(N);
21326 SDValue Chain = DAG.getEntryNode();
21327
21328 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21329
21330 // If we're converting from SSE, the stack slot needs to hold both types.
21331 // Otherwise it only needs to hold the DstVT.
21332 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21333 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21334 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21335 MachinePointerInfo MPI =
21336 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21337
21338 if (UseSSE) {
21339 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21340 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21341 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21342 SDValue Ops[] = { Chain, StackPtr };
21343
21344 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21345 /*Align*/ None, MachineMemOperand::MOLoad);
21346 Chain = Src.getValue(1);
21347 }
21348
21349 SDValue StoreOps[] = { Chain, Src, StackPtr };
21350 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21351 StoreOps, DstVT, MPI, /*Align*/ None,
21352 MachineMemOperand::MOStore);
21353
21354 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21355}
21356
21357SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21358 bool IsStrict = Op->isStrictFPOpcode();
21359
21360 SDLoc DL(Op);
21361 MVT VT = Op.getSimpleValueType();
21362 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21363 MVT SVT = In.getSimpleValueType();
21364
21365 if (VT == MVT::f128) {
21366 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
21367 return LowerF128Call(Op, DAG, LC);
21368 }
21369
21370 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21371
21372 SDValue Res =
21373 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21374 if (IsStrict)
21375 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21376 {Op->getOperand(0), Res});
21377 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21378}
21379
21380SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21381 bool IsStrict = Op->isStrictFPOpcode();
21382
21383 MVT VT = Op.getSimpleValueType();
21384 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21385 MVT SVT = In.getSimpleValueType();
21386
21387 // It's legal except when f128 is involved.
21388 if (SVT != MVT::f128)
21389 return Op;
21390
21391 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
21392
21393 // FP_ROUND node has a second operand indicating whether it is known to be
21394 // precise. That doesn't take part in the LibCall so we can't directly use
21395 // LowerF128Call.
21396
21397 SDLoc dl(Op);
21398 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21399 MakeLibCallOptions CallOptions;
21400 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
21401 dl, Chain);
21402
21403 if (IsStrict)
21404 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21405
21406 return Tmp.first;
21407}
21408
21409static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21410 bool IsStrict = Op->isStrictFPOpcode();
21411 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21412 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21413 "Unexpected VT!");
21414
21415 SDLoc dl(Op);
21416 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21417 DAG.getConstant(0, dl, MVT::v8i16), Src,
21418 DAG.getIntPtrConstant(0, dl));
21419
21420 SDValue Chain;
21421 if (IsStrict) {
21422 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21423 {Op.getOperand(0), Res});
21424 Chain = Res.getValue(1);
21425 } else {
21426 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21427 }
21428
21429 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21430 DAG.getIntPtrConstant(0, dl));
21431
21432 if (IsStrict)
21433 return DAG.getMergeValues({Res, Chain}, dl);
21434
21435 return Res;
21436}
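Assuming an F16C-capable target, a one-line scalar equivalent of the widen/CVTPH2PS/extract sequence above (editorial sketch, not part of the lowering):

#include <immintrin.h>
#include <cstdint>

static float halfBitsToFloat(uint16_t bits) {
  return _cvtsh_ss(bits); // VCVTPH2PS applied to element 0 of a widened vector
}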
21437
21438static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21439 bool IsStrict = Op->isStrictFPOpcode();
21440 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21441 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21442 "Unexpected VT!");
21443
21444 SDLoc dl(Op);
21445 SDValue Res, Chain;
21446 if (IsStrict) {
21447 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21448 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21449 DAG.getIntPtrConstant(0, dl));
21450 Res = DAG.getNode(
21451 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21452 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21453 Chain = Res.getValue(1);
21454 } else {
21455 // FIXME: Should we use zeros for upper elements for non-strict?
21456 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21457 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21458 DAG.getTargetConstant(4, dl, MVT::i32));
21459 }
21460
21461 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21462 DAG.getIntPtrConstant(0, dl));
21463
21464 if (IsStrict)
21465 return DAG.getMergeValues({Res, Chain}, dl);
21466
21467 return Res;
21468}
21469
21470/// Depending on uarch and/or optimizing for size, we might prefer to use a
21471/// vector operation in place of the typical scalar operation.
21472static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21473 const X86Subtarget &Subtarget) {
21474 // If both operands have other uses, this is probably not profitable.
21475 SDValue LHS = Op.getOperand(0);
21476 SDValue RHS = Op.getOperand(1);
21477 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21478 return Op;
21479
21480 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21481 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21482 if (IsFP && !Subtarget.hasSSE3())
21483 return Op;
21484 if (!IsFP && !Subtarget.hasSSSE3())
21485 return Op;
21486
21487 // Extract from a common vector.
21488 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21489 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21490 LHS.getOperand(0) != RHS.getOperand(0) ||
21491 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21492 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21493 !shouldUseHorizontalOp(true, DAG, Subtarget))
21494 return Op;
21495
21496 // Allow commuted 'hadd' ops.
21497 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21498 unsigned HOpcode;
21499 switch (Op.getOpcode()) {
21500 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21501 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21502 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21503 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21504 default:
21505 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21506 }
21507 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21508 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21509 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21510 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21511 std::swap(LExtIndex, RExtIndex);
21512
21513 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21514 return Op;
21515
21516 SDValue X = LHS.getOperand(0);
21517 EVT VecVT = X.getValueType();
21518 unsigned BitWidth = VecVT.getSizeInBits();
21519 unsigned NumLanes = BitWidth / 128;
21520 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21521 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21522 "Not expecting illegal vector widths here");
21523
21524 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21525 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21526 SDLoc DL(Op);
21527 if (BitWidth == 256 || BitWidth == 512) {
21528 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21529 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21530 LExtIndex %= NumEltsPerLane;
21531 }
21532
21533 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21534 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21535 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21536 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21537 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21538 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21539 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21540}
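A minimal SSE3 sketch of the "add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0" mapping listed above (the helper name is hypothetical):

#include <immintrin.h>

static float addLowPairWithHadd(__m128 x) {
  __m128 h = _mm_hadd_ps(x, x); // lanes: x0+x1, x2+x3, x0+x1, x2+x3
  return _mm_cvtss_f32(h);      // element 0 holds x[0] + x[1]
}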
21541
21542/// Depending on uarch and/or optimizing for size, we might prefer to use a
21543/// vector operation in place of the typical scalar operation.
21544SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21545 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21546 "Only expecting float/double");
21547 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21548}
21549
21550/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21551/// This mode isn't supported in hardware on X86. But as long as we aren't
21552/// compiling with trapping math, we can emulate this with
21553/// floor(X + copysign(nextafter(0.5, 0.0), X)).
21554static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21555 SDValue N0 = Op.getOperand(0);
21556 SDLoc dl(Op);
21557 MVT VT = Op.getSimpleValueType();
21558
21559 // N0 += copysign(nextafter(0.5, 0.0), N0)
21560 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21561 bool Ignored;
21562 APFloat Point5Pred = APFloat(0.5f);
21563 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21564 Point5Pred.next(/*nextDown*/true);
21565
21566 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21567 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21568 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21569
21570 // Truncate the result to remove fraction.
21571 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21572}
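A scalar C++ sketch of the same emulation, assuming non-trapping math as the comment above requires (illustrative only, not part of this file): adding copysign(nextafter(0.5, 0.0), x) and truncating rounds halfway cases away from zero without ever pushing a value just below .5 across the boundary.

#include <cmath>
#include <cassert>

// Round to nearest, ties away from zero, without a dedicated rounding mode:
// bias by just-under-0.5 carrying the sign of x, then truncate.
static float roundHalfAwayFromZero(float x) {
  float bias = std::copysign(std::nextafter(0.5f, 0.0f), x);
  return std::trunc(x + bias);
}

int main() {
  assert(roundHalfAwayFromZero(2.5f) == 3.0f);
  assert(roundHalfAwayFromZero(-2.5f) == -3.0f);
  assert(roundHalfAwayFromZero(2.4f) == 2.0f);
  return 0;
}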
21573
21574/// The only differences between FABS and FNEG are the mask and the logic op.
21575/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21576static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21577   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21578          "Wrong opcode for lowering FABS or FNEG.");
21579
21580 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21581
21582 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21583 // into an FNABS. We'll lower the FABS after that if it is still in use.
21584 if (IsFABS)
21585 for (SDNode *User : Op->uses())
21586 if (User->getOpcode() == ISD::FNEG)
21587 return Op;
21588
21589 SDLoc dl(Op);
21590 MVT VT = Op.getSimpleValueType();
21591
21592 bool IsF128 = (VT == MVT::f128);
21593   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21594           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21595           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21596          "Unexpected type in LowerFABSorFNEG");
21597
21598 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
21599 // decide if we should generate a 16-byte constant mask when we only need 4 or
21600 // 8 bytes for the scalar case.
21601
21602 // There are no scalar bitwise logical SSE/AVX instructions, so we
21603 // generate a 16-byte vector constant and logic op even for the scalar case.
21604 // Using a 16-byte mask allows folding the load of the mask with
21605 // the logic op, so it can save (~4 bytes) on code size.
21606 bool IsFakeVector = !VT.isVector() && !IsF128;
21607 MVT LogicVT = VT;
21608 if (IsFakeVector)
21609 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21610
21611 unsigned EltBits = VT.getScalarSizeInBits();
21612 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21613 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21614 APInt::getSignMask(EltBits);
21615 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21616 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21617
21618 SDValue Op0 = Op.getOperand(0);
21619 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21620 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21621 IsFNABS ? X86ISD::FOR :
21622 X86ISD::FXOR;
21623 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21624
21625 if (VT.isVector() || IsF128)
21626 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21627
21628 // For the scalar case extend to a 128-bit vector, perform the logic op,
21629 // and extract the scalar result back out.
21630 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21631 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21632 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21633 DAG.getIntPtrConstant(0, dl));
21634}
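The masks built above implement the usual IEEE-754 bit tricks: FABS clears the sign bit with an AND of 0x7f...f, and FNEG flips it with an XOR of 0x80...0. A scalar sketch of both (illustrative only, not part of this file):

#include <cstdint>
#include <cstring>
#include <cassert>

// fabs: keep everything but the sign bit.
static float fabsBitwise(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0x7fffffffu;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

// fneg: flip only the sign bit.
static float fnegBitwise(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 0x80000000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

int main() {
  assert(fabsBitwise(-1.5f) == 1.5f);
  assert(fnegBitwise(2.0f) == -2.0f);
  return 0;
}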
21635
21636static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21637 SDValue Mag = Op.getOperand(0);
21638 SDValue Sign = Op.getOperand(1);
21639 SDLoc dl(Op);
21640
21641 // If the sign operand is smaller, extend it first.
21642 MVT VT = Op.getSimpleValueType();
21643 if (Sign.getSimpleValueType().bitsLT(VT))
21644 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21645
21646 // And if it is bigger, shrink it first.
21647 if (Sign.getSimpleValueType().bitsGT(VT))
21648 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
21649
21650 // At this point the operands and the result should have the same
21651 // type, and that won't be f80 since that is not custom lowered.
21652 bool IsF128 = (VT == MVT::f128);
21653   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
21654           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
21655           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
21656          "Unexpected type in LowerFCOPYSIGN");
21657
21658 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21659
21660 // Perform all scalar logic operations as 16-byte vectors because there are no
21661 // scalar FP logic instructions in SSE.
21662 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21663 // unnecessary splats, but we might miss load folding opportunities. Should
21664 // this decision be based on OptimizeForSize?
21665 bool IsFakeVector = !VT.isVector() && !IsF128;
21666 MVT LogicVT = VT;
21667 if (IsFakeVector)
21668 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
21669
21670 // The mask constants are automatically splatted for vector types.
21671 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21672 SDValue SignMask = DAG.getConstantFP(
21673 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21674 SDValue MagMask = DAG.getConstantFP(
21675 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21676
21677 // First, clear all bits but the sign bit from the second operand (sign).
21678 if (IsFakeVector)
21679 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21680 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21681
21682 // Next, clear the sign bit from the first operand (magnitude).
21683 // TODO: If we had general constant folding for FP logic ops, this check
21684 // wouldn't be necessary.
21685 SDValue MagBits;
21686 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21687 APFloat APF = Op0CN->getValueAPF();
21688 APF.clearSign();
21689 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21690 } else {
21691 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21692 if (IsFakeVector)
21693 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21694 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21695 }
21696
21697 // OR the magnitude value with the sign bit.
21698 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21699 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21700 DAG.getIntPtrConstant(0, dl));
21701}
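In scalar terms the sequence above computes (mag & ~SIGN_MASK) | (sign & SIGN_MASK). An illustrative standalone equivalent (not part of this file):

#include <cstdint>
#include <cstring>
#include <cassert>

// copysign(mag, sign): magnitude bits from Mag, sign bit from Sign.
static float copysignBitwise(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = (M & 0x7fffffffu) | (S & 0x80000000u);
  float Out;
  std::memcpy(&Out, &R, sizeof(R));
  return Out;
}

int main() {
  assert(copysignBitwise(3.0f, -0.0f) == -3.0f);
  assert(copysignBitwise(-3.0f, 1.0f) == 3.0f);
  return 0;
}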
21702
21703static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21704 SDValue N0 = Op.getOperand(0);
21705 SDLoc dl(Op);
21706 MVT VT = Op.getSimpleValueType();
21707
21708 MVT OpVT = N0.getSimpleValueType();
21709   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21710          "Unexpected type for FGETSIGN");
21711
21712 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21713 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21714 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21715 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21716 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21717 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21718 return Res;
21719}
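A small sketch of the MOVMSK idea, assuming an SSE host and <xmmintrin.h> (illustrative only, not part of this file): MOVMSKPS gathers the sign bits of all four lanes, and masking with 1 keeps only lane 0, which holds the scalar.

#include <xmmintrin.h>
#include <cassert>

// FGETSIGN via MOVMSKPS: bit 0 of the mask is the sign of lane 0.
static int getSignViaMovmsk(float x) {
  __m128 v = _mm_set_ss(x);          // x in lane 0, zeros elsewhere
  return _mm_movemask_ps(v) & 1;
}

int main() {
  assert(getSignViaMovmsk(-1.0f) == 1);
  assert(getSignViaMovmsk(2.0f) == 0);
  return 0;
}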
21720
21721/// Helper for creating a X86ISD::SETCC node.
21722static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21723 SelectionDAG &DAG) {
21724 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21725 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21726}
21727
21728/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
21729/// style scalarized (associative) reduction patterns. Partial reductions
21730/// are supported when the pointer SrcMask is non-null.
21731/// TODO - move this to SelectionDAG?
21732static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
21733 SmallVectorImpl<SDValue> &SrcOps,
21734 SmallVectorImpl<APInt> *SrcMask = nullptr) {
21735 SmallVector<SDValue, 8> Opnds;
21736 DenseMap<SDValue, APInt> SrcOpMap;
21737 EVT VT = MVT::Other;
21738
21739   // Recognize a special case where a vector is cast into a wide integer to
21740 // test all 0s.
21741   assert(Op.getOpcode() == unsigned(BinOp) &&
21742          "Unexpected bit reduction opcode");
21743 Opnds.push_back(Op.getOperand(0));
21744 Opnds.push_back(Op.getOperand(1));
21745
21746 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
21747 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
21748 // BFS traverse all BinOp operands.
21749 if (I->getOpcode() == unsigned(BinOp)) {
21750 Opnds.push_back(I->getOperand(0));
21751 Opnds.push_back(I->getOperand(1));
21752 // Re-evaluate the number of nodes to be traversed.
21753 e += 2; // 2 more nodes (LHS and RHS) are pushed.
21754 continue;
21755 }
21756
21757     // Quit if this is not an EXTRACT_VECTOR_ELT.
21758 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
21759 return false;
21760
21761     // Quit if the index is not a constant.
21762 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
21763 if (!Idx)
21764 return false;
21765
21766 SDValue Src = I->getOperand(0);
21767 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
21768 if (M == SrcOpMap.end()) {
21769 VT = Src.getValueType();
21770 // Quit if not the same type.
21771 if (SrcOpMap.begin() != SrcOpMap.end() &&
21772 VT != SrcOpMap.begin()->first.getValueType())
21773 return false;
21774 unsigned NumElts = VT.getVectorNumElements();
21775 APInt EltCount = APInt::getNullValue(NumElts);
21776 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
21777 SrcOps.push_back(Src);
21778 }
21779
21780 // Quit if element already used.
21781 unsigned CIdx = Idx->getZExtValue();
21782 if (M->second[CIdx])
21783 return false;
21784 M->second.setBit(CIdx);
21785 }
21786
21787 if (SrcMask) {
21788 // Collect the source partial masks.
21789 for (SDValue &SrcOp : SrcOps)
21790 SrcMask->push_back(SrcOpMap[SrcOp]);
21791 } else {
21792 // Quit if not all elements are used.
21793 for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
21794 E = SrcOpMap.end();
21795 I != E; ++I) {
21796 if (!I->second.isAllOnesValue())
21797 return false;
21798 }
21799 }
21800
21801 return true;
21802}
21803
21804// Helper function for comparing all bits of a vector against zero.
21805static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
21806 const APInt &Mask,
21807 const X86Subtarget &Subtarget,
21808 SelectionDAG &DAG, X86::CondCode &X86CC) {
21809 EVT VT = V.getValueType();
21810 unsigned ScalarSize = VT.getScalarSizeInBits();
21811 if (Mask.getBitWidth() != ScalarSize) {
21812     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
21813 return SDValue();
21814 }
21815
21816   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
21817 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
21818
21819 auto MaskBits = [&](SDValue Src) {
21820 if (Mask.isAllOnesValue())
21821 return Src;
21822 EVT SrcVT = Src.getValueType();
21823 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
21824 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
21825 };
21826
21827 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
21828 if (VT.getSizeInBits() < 128) {
21829 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
21830 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
21831 return SDValue();
21832 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
21833 DAG.getBitcast(IntVT, MaskBits(V)),
21834 DAG.getConstant(0, DL, IntVT));
21835 }
21836
21837 // Quit if not splittable to 128/256-bit vector.
21838 if (!isPowerOf2_32(VT.getSizeInBits()))
21839 return SDValue();
21840
21841 // Split down to 128/256-bit vector.
21842 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
21843 while (VT.getSizeInBits() > TestSize) {
21844 auto Split = DAG.SplitVector(V, DL);
21845 VT = Split.first.getValueType();
21846 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
21847 }
21848
21849 bool UsePTEST = Subtarget.hasSSE41();
21850 if (UsePTEST) {
21851 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
21852 V = DAG.getBitcast(TestVT, MaskBits(V));
21853 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
21854 }
21855
21856 // Without PTEST, a masked v2i64 or-reduction is not faster than
21857 // scalarization.
21858 if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
21859 return SDValue();
21860
21861 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
21862 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
21863 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
21864 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
21865 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
21866 DAG.getConstant(0xFFFF, DL, MVT::i32));
21867}
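The non-PTEST fallback above corresponds to the following standalone SSE2 sketch (illustrative only, not part of this file): compare every byte with zero and require MOVMSKB to report all 16 lanes equal.

#include <emmintrin.h>
#include <cassert>

// "Is this 128-bit vector all zero?" without PTEST: PCMPEQB against zero,
// then check that the byte mask covers all 16 lanes.
static bool isAllZero128(__m128i V) {
  __m128i EqZero = _mm_cmpeq_epi8(V, _mm_setzero_si128());
  return _mm_movemask_epi8(EqZero) == 0xFFFF;
}

int main() {
  assert(isAllZero128(_mm_setzero_si128()));
  assert(!isAllZero128(_mm_set_epi32(0, 0, 1, 0)));
  return 0;
}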
21868
21869// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
21870// CMP(MOVMSK(PCMPEQB(X,0))).
21871static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
21872 const SDLoc &DL,
21873 const X86Subtarget &Subtarget,
21874 SelectionDAG &DAG, SDValue &X86CC) {
21875   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
21876
21877 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
21878 return SDValue();
21879
21880 // Check whether we're masking/truncating an OR-reduction result, in which
21881 // case track the masked bits.
21882 APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
21883 switch (Op.getOpcode()) {
21884 case ISD::TRUNCATE: {
21885 SDValue Src = Op.getOperand(0);
21886 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
21887 Op.getScalarValueSizeInBits());
21888 Op = Src;
21889 break;
21890 }
21891 case ISD::AND: {
21892 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
21893 Mask = Cst->getAPIntValue();
21894 Op = Op.getOperand(0);
21895 }
21896 break;
21897 }
21898 }
21899
21900 SmallVector<SDValue, 8> VecIns;
21901 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
21902 EVT VT = VecIns[0].getValueType();
21903     assert(llvm::all_of(VecIns,
21904                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
21905            "Reduction source vector mismatch");
21906
21907 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
21908 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
21909 return SDValue();
21910
21911 // If more than one full vector is evaluated, OR them first before PTEST.
21912 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
21913 Slot += 2, e += 1) {
21914 // Each iteration will OR 2 nodes and append the result until there is
21915 // only 1 node left, i.e. the final OR'd value of all vectors.
21916 SDValue LHS = VecIns[Slot];
21917 SDValue RHS = VecIns[Slot + 1];
21918 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
21919 }
21920
21921 X86::CondCode CCode;
21922 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
21923 DAG, CCode)) {
21924 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
21925 return V;
21926 }
21927 }
21928
21929 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
21930 ISD::NodeType BinOp;
21931 if (SDValue Match =
21932 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
21933 X86::CondCode CCode;
21934 if (SDValue V =
21935 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
21936 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
21937 return V;
21938 }
21939 }
21940 }
21941
21942 return SDValue();
21943}
21944
21945/// Return true if \c Op has a use that doesn't just read flags.
21946static bool hasNonFlagsUse(SDValue Op) {
21947 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
21948 ++UI) {
21949 SDNode *User = *UI;
21950 unsigned UOpNo = UI.getOperandNo();
21951 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
21952       // Look past the truncate.
21953 UOpNo = User->use_begin().getOperandNo();
21954 User = *User->use_begin();
21955 }
21956
21957 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
21958 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
21959 return true;
21960 }
21961 return false;
21962}
21963
21964// Transform to an x86-specific ALU node with flags if there is a chance of
21965// using an RMW op or only the flags are used. Otherwise, leave
21966// the node alone and emit a 'cmp' or 'test' instruction.
21967static bool isProfitableToUseFlagOp(SDValue Op) {
21968 for (SDNode *U : Op->uses())
21969 if (U->getOpcode() != ISD::CopyToReg &&
21970 U->getOpcode() != ISD::SETCC &&
21971 U->getOpcode() != ISD::STORE)
21972 return false;
21973
21974 return true;
21975}
21976
21977/// Emit nodes that will be selected as "test Op0,Op0", or something
21978/// equivalent.
21979static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
21980 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
21981 // CF and OF aren't always set the way we want. Determine which
21982 // of these we need.
21983 bool NeedCF = false;
21984 bool NeedOF = false;
21985 switch (X86CC) {
21986 default: break;
21987 case X86::COND_A: case X86::COND_AE:
21988 case X86::COND_B: case X86::COND_BE:
21989 NeedCF = true;
21990 break;
21991 case X86::COND_G: case X86::COND_GE:
21992 case X86::COND_L: case X86::COND_LE:
21993 case X86::COND_O: case X86::COND_NO: {
21994 // Check if we really need to set the
21995 // Overflow flag. If NoSignedWrap is present
21996 // that is not actually needed.
21997 switch (Op->getOpcode()) {
21998 case ISD::ADD:
21999 case ISD::SUB:
22000 case ISD::MUL:
22001 case ISD::SHL:
22002 if (Op.getNode()->getFlags().hasNoSignedWrap())
22003 break;
22004       LLVM_FALLTHROUGH;
22005 default:
22006 NeedOF = true;
22007 break;
22008 }
22009 break;
22010 }
22011 }
22012 // See if we can use the EFLAGS value from the operand instead of
22013 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22014 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22015 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22016 // Emit a CMP with 0, which is the TEST pattern.
22017 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22018 DAG.getConstant(0, dl, Op.getValueType()));
22019 }
22020 unsigned Opcode = 0;
22021 unsigned NumOperands = 0;
22022
22023 SDValue ArithOp = Op;
22024
22025 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22026 // which may be the result of a CAST. We use the variable 'Op', which is the
22027 // non-casted variable when we check for possible users.
22028 switch (ArithOp.getOpcode()) {
22029 case ISD::AND:
22030 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22031 // because a TEST instruction will be better.
22032 if (!hasNonFlagsUse(Op))
22033 break;
22034
22035     LLVM_FALLTHROUGH;
22036 case ISD::ADD:
22037 case ISD::SUB:
22038 case ISD::OR:
22039 case ISD::XOR:
22040 if (!isProfitableToUseFlagOp(Op))
22041 break;
22042
22043 // Otherwise use a regular EFLAGS-setting instruction.
22044 switch (ArithOp.getOpcode()) {
22045     default: llvm_unreachable("unexpected operator!");
22046 case ISD::ADD: Opcode = X86ISD::ADD; break;
22047 case ISD::SUB: Opcode = X86ISD::SUB; break;
22048 case ISD::XOR: Opcode = X86ISD::XOR; break;
22049 case ISD::AND: Opcode = X86ISD::AND; break;
22050 case ISD::OR: Opcode = X86ISD::OR; break;
22051 }
22052
22053 NumOperands = 2;
22054 break;
22055 case X86ISD::ADD:
22056 case X86ISD::SUB:
22057 case X86ISD::OR:
22058 case X86ISD::XOR:
22059 case X86ISD::AND:
22060 return SDValue(Op.getNode(), 1);
22061 case ISD::SSUBO:
22062 case ISD::USUBO: {
22063     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22064 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22065 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22066 Op->getOperand(1)).getValue(1);
22067 }
22068 default:
22069 break;
22070 }
22071
22072 if (Opcode == 0) {
22073 // Emit a CMP with 0, which is the TEST pattern.
22074 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22075 DAG.getConstant(0, dl, Op.getValueType()));
22076 }
22077 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22078 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22079
22080 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22081 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22082 return SDValue(New.getNode(), 1);
22083}
22084
22085/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22086/// equivalent.
22087static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22088 const SDLoc &dl, SelectionDAG &DAG,
22089 const X86Subtarget &Subtarget) {
22090 if (isNullConstant(Op1))
22091 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22092
22093 EVT CmpVT = Op0.getValueType();
22094
22095   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22096           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22097
22098 // Only promote the compare up to I32 if it is a 16 bit operation
22099 // with an immediate. 16 bit immediates are to be avoided.
22100 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22101 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22102 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22103 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22104 // Don't do this if the immediate can fit in 8-bits.
22105 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22106 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22107 unsigned ExtendOp =
22108 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22109 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22110 // For equality comparisons try to use SIGN_EXTEND if the input was
22111 // truncate from something with enough sign bits.
22112 if (Op0.getOpcode() == ISD::TRUNCATE) {
22113 SDValue In = Op0.getOperand(0);
22114 unsigned EffBits =
22115 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22116 if (EffBits <= 16)
22117 ExtendOp = ISD::SIGN_EXTEND;
22118 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22119 SDValue In = Op1.getOperand(0);
22120 unsigned EffBits =
22121 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22122 if (EffBits <= 16)
22123 ExtendOp = ISD::SIGN_EXTEND;
22124 }
22125 }
22126
22127 CmpVT = MVT::i32;
22128 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22129 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22130 }
22131 }
22132
22133 // Try to shrink i64 compares if the input has enough zero bits.
22134 // FIXME: Do this for non-constant compares for constant on LHS?
22135 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22136 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22137 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22138 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22139 CmpVT = MVT::i32;
22140 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22141 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22142 }
22143
22144 // 0-x == y --> x+y == 0
22145 // 0-x != y --> x+y != 0
22146 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22147 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22148 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22149 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22150 return Add.getValue(1);
22151 }
22152
22153 // x == 0-y --> x+y == 0
22154 // x != 0-y --> x+y != 0
22155 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22156 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22157 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22158 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22159 return Add.getValue(1);
22160 }
22161
22162 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22163 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22164 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22165 return Sub.getValue(1);
22166}
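The 0-x == y folds above rest on a two's-complement identity: (0 - x) == y holds exactly when x + y wraps to 0, so the flags of an ADD can stand in for the compare. A small self-checking sketch (illustrative only, not part of this file):

#include <cstdint>
#include <cassert>

int main() {
  // Unsigned arithmetic wraps modulo 2^32, so (0 - x) == y  <=>  x + y == 0.
  const uint32_t Vals[] = {0u, 1u, 5u, 0x80000000u, 0xFFFFFFFBu, 0xFFFFFFFFu};
  for (uint32_t x : Vals)
    for (uint32_t y : Vals)
      assert(((0u - x) == y) == (x + y == 0u));
  return 0;
}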
22167
22168/// Check if replacement of SQRT with RSQRT should be disabled.
22169bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22170 EVT VT = Op.getValueType();
22171
22172 // We never want to use both SQRT and RSQRT instructions for the same input.
22173 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22174 return false;
22175
22176 if (VT.isVector())
22177 return Subtarget.hasFastVectorFSQRT();
22178 return Subtarget.hasFastScalarFSQRT();
22179}
22180
22181/// The minimum architected relative accuracy is 2^-12. We need one
22182/// Newton-Raphson step to have a good float result (24 bits of precision).
22183SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22184 SelectionDAG &DAG, int Enabled,
22185 int &RefinementSteps,
22186 bool &UseOneConstNR,
22187 bool Reciprocal) const {
22188 EVT VT = Op.getValueType();
22189
22190 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22191 // It is likely not profitable to do this for f64 because a double-precision
22192 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22193 // instructions: convert to single, rsqrtss, convert back to double, refine
22194 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22195 // along with FMA, this could be a throughput win.
22196 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22197 // after legalize types.
22198 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22199 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22200 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22201 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22202 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22203 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22204 RefinementSteps = 1;
22205
22206 UseOneConstNR = false;
22207     // There is no 512-bit FRSQRT, but there is RSQRT14.
22208 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22209 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22210 }
22211 return SDValue();
22212}
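A standalone sketch of the estimate-plus-one-refinement idea for f32, assuming an SSE host (illustrative only, not part of this file; the function above only returns the estimate node, and the requested RefinementSteps are applied by the surrounding DAG machinery): one Newton-Raphson step e' = e * (1.5 - 0.5 * a * e * e) lifts the roughly 12-bit RSQRTSS estimate toward single precision.

#include <xmmintrin.h>
#include <cmath>
#include <cassert>

static float rsqrtRefined(float a) {
  __m128 A = _mm_set_ss(a);
  __m128 E = _mm_rsqrt_ss(A);                              // ~12-bit estimate
  __m128 Half = _mm_set_ss(0.5f), ThreeHalves = _mm_set_ss(1.5f);
  __m128 EE = _mm_mul_ss(E, E);
  // e' = e * (1.5 - 0.5 * a * e * e)
  __m128 Refined =
      _mm_mul_ss(E, _mm_sub_ss(ThreeHalves, _mm_mul_ss(_mm_mul_ss(Half, A), EE)));
  return _mm_cvtss_f32(Refined);
}

int main() {
  float r = rsqrtRefined(4.0f);                            // expect ~0.5
  assert(std::fabs(r - 0.5f) < 1e-5f);
  return 0;
}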
22213
22214/// The minimum architected relative accuracy is 2^-12. We need one
22215/// Newton-Raphson step to have a good float result (24 bits of precision).
22216SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22217 int Enabled,
22218 int &RefinementSteps) const {
22219 EVT VT = Op.getValueType();
22220
22221 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22222 // It is likely not profitable to do this for f64 because a double-precision
22223 // reciprocal estimate with refinement on x86 prior to FMA requires
22224 // 15 instructions: convert to single, rcpss, convert back to double, refine
22225 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22226 // along with FMA, this could be a throughput win.
22227
22228 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22229 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22230 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22231 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22232 // Enable estimate codegen with 1 refinement step for vector division.
22233 // Scalar division estimates are disabled because they break too much
22234 // real-world code. These defaults are intended to match GCC behavior.
22235 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22236 return SDValue();
22237
22238 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22239 RefinementSteps = 1;
22240
22241     // There is no 512-bit FRCP, but there is RCP14.
22242 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22243 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22244 }
22245 return SDValue();
22246}
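The reciprocal analogue, again as an illustrative standalone sketch for f32 on an SSE host (not part of this file): one Newton-Raphson step e' = e * (2 - a * e) on the RCPSS estimate roughly doubles its number of correct bits.

#include <xmmintrin.h>
#include <cmath>
#include <cassert>

static float recipRefined(float a) {
  __m128 A = _mm_set_ss(a);
  __m128 E = _mm_rcp_ss(A);                                // ~12-bit estimate
  __m128 Two = _mm_set_ss(2.0f);
  // e' = e * (2 - a * e)
  __m128 Refined = _mm_mul_ss(E, _mm_sub_ss(Two, _mm_mul_ss(A, E)));
  return _mm_cvtss_f32(Refined);
}

int main() {
  float r = recipRefined(8.0f);                            // expect ~0.125
  assert(std::fabs(r - 0.125f) < 1e-5f);
  return 0;
}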
22247
22248/// If we have at least two divisions that use the same divisor, convert to
22249/// multiplication by a reciprocal. This may need to be adjusted for a given
22250/// CPU if a division's cost is not at least twice the cost of a multiplication.
22251/// This is because we still need one division to calculate the reciprocal and
22252/// then we need two multiplies by that reciprocal as replacements for the
22253/// original divisions.
22254unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22255 return 2;
22256}
22257
22258SDValue
22259X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22260 SelectionDAG &DAG,
22261 SmallVectorImpl<SDNode *> &Created) const {
22262 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22263 if (isIntDivCheap(N->getValueType(0), Attr))
22264 return SDValue(N,0); // Lower SDIV as SDIV
22265
22266   assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22267          "Unexpected divisor!");
22268
22269 // Only perform this transform if CMOV is supported otherwise the select
22270 // below will become a branch.
22271 if (!Subtarget.hasCMov())
22272 return SDValue();
22273
22274 // fold (sdiv X, pow2)
22275 EVT VT = N->getValueType(0);
22276 // FIXME: Support i8.
22277 if (VT != MVT::i16 && VT != MVT::i32 &&
22278 !(Subtarget.is64Bit() && VT == MVT::i64))
22279 return SDValue();
22280
22281 unsigned Lg2 = Divisor.countTrailingZeros();
22282
22283 // If the divisor is 2 or -2, the default expansion is better.
22284 if (Lg2 == 1)
22285 return SDValue();
22286
22287 SDLoc DL(N);
22288 SDValue N0 = N->getOperand(0);
22289 SDValue Zero = DAG.getConstant(0, DL, VT);
22290 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22291 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22292
22293 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22294 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22295 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22296 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22297
22298 Created.push_back(Cmp.getNode());
22299 Created.push_back(Add.getNode());
22300 Created.push_back(CMov.getNode());
22301
22302 // Divide by pow2.
22303 SDValue SRA =
22304 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22305
22306 // If we're dividing by a positive value, we're done. Otherwise, we must
22307 // negate the result.
22308 if (Divisor.isNonNegative())
22309 return SRA;
22310
22311 Created.push_back(SRA.getNode());
22312 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22313}
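A scalar picture of the expansion built above, as a hedged sketch (not part of this file; it assumes the usual x86 behaviour that >> on a negative int is an arithmetic shift): bias a negative dividend by 2^k - 1 so the arithmetic shift truncates toward zero, then negate if the divisor was negative.

#include <cstdint>
#include <cassert>

static int32_t sdivByPow2(int32_t N, unsigned Lg2, bool NegativeDivisor) {
  int32_t Biased = N < 0 ? N + ((int32_t(1) << Lg2) - 1) : N;  // the CMOV above
  int32_t Quot = Biased >> Lg2;                                // the SRA
  return NegativeDivisor ? -Quot : Quot;                       // the final SUB
}

int main() {
  assert(sdivByPow2(23, 3, false) == 23 / 8);
  assert(sdivByPow2(-23, 3, false) == -23 / 8);   // -2, rounded toward zero
  assert(sdivByPow2(-23, 3, true) == -23 / -8);   //  2
  return 0;
}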
22314
22315/// Result of 'and' is compared against zero. Change to a BT node if possible.
22316/// Returns the BT node and the condition code needed to use it.
22317static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22318 const SDLoc &dl, SelectionDAG &DAG,
22319 SDValue &X86CC) {
22320   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22321 SDValue Op0 = And.getOperand(0);
22322 SDValue Op1 = And.getOperand(1);
22323 if (Op0.getOpcode() == ISD::TRUNCATE)
22324 Op0 = Op0.getOperand(0);
22325 if (Op1.getOpcode() == ISD::TRUNCATE)
22326 Op1 = Op1.getOperand(0);
22327
22328 SDValue Src, BitNo;
22329 if (Op1.getOpcode() == ISD::SHL)
22330 std::swap(Op0, Op1);
22331 if (Op0.getOpcode() == ISD::SHL) {
22332 if (isOneConstant(Op0.getOperand(0))) {
22333 // If we looked past a truncate, check that it's only truncating away
22334 // known zeros.
22335 unsigned BitWidth = Op0.getValueSizeInBits();
22336 unsigned AndBitWidth = And.getValueSizeInBits();
22337 if (BitWidth > AndBitWidth) {
22338 KnownBits Known = DAG.computeKnownBits(Op0);
22339 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22340 return SDValue();
22341 }
22342 Src = Op1;
22343 BitNo = Op0.getOperand(1);
22344 }
22345 } else if (Op1.getOpcode() == ISD::Constant) {
22346 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22347 uint64_t AndRHSVal = AndRHS->getZExtValue();
22348 SDValue AndLHS = Op0;
22349
22350 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22351 Src = AndLHS.getOperand(0);
22352 BitNo = AndLHS.getOperand(1);
22353 } else {
22354 // Use BT if the immediate can't be encoded in a TEST instruction or we
22355       // are optimizing for size and the immediate won't fit in a byte.
22356 bool OptForSize = DAG.shouldOptForSize();
22357 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22358 isPowerOf2_64(AndRHSVal)) {
22359 Src = AndLHS;
22360 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22361 Src.getValueType());
22362 }
22363 }
22364 }
22365
22366 // No patterns found, give up.
22367 if (!Src.getNode())
22368 return SDValue();
22369
22370 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22371 // instruction. Since the shift amount is in-range-or-undefined, we know
22372 // that doing a bittest on the i32 value is ok. We extend to i32 because
22373 // the encoding for the i16 version is larger than the i32 version.
22374 // Also promote i16 to i32 for performance / code size reason.
22375 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22376 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22377
22378 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22379 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22380 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22381 // known to be zero.
22382 if (Src.getValueType() == MVT::i64 &&
22383 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22384 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22385
22386 // If the operand types disagree, extend the shift amount to match. Since
22387 // BT ignores high bits (like shifts) we can use anyextend.
22388 if (Src.getValueType() != BitNo.getValueType())
22389 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22390
22391 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22392 dl, MVT::i8);
22393 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22394}
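Every pattern accepted above reduces to a single bit test, i.e. asking whether bit N of Src is set, which is what BT reports in the carry flag. A trivial scalar sketch (illustrative only, not part of this file):

#include <cstdint>
#include <cassert>

// Both "(X >> N) & 1" and "X & (1 << N)" ask the same question BT answers.
static bool testBit(uint64_t X, unsigned N) {
  return (X >> N) & 1u;
}

int main() {
  assert(testBit(0b1010, 1));
  assert(!testBit(0b1010, 2));
  assert(testBit(uint64_t(1) << 40, 40));
  return 0;
}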
22395
22396/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22397/// CMPs.
22398static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22399 SDValue &Op1, bool &IsAlwaysSignaling) {
22400 unsigned SSECC;
22401 bool Swap = false;
22402
22403 // SSE Condition code mapping:
22404 // 0 - EQ
22405 // 1 - LT
22406 // 2 - LE
22407 // 3 - UNORD
22408 // 4 - NEQ
22409 // 5 - NLT
22410 // 6 - NLE
22411 // 7 - ORD
22412 switch (SetCCOpcode) {
22413   default: llvm_unreachable("Unexpected SETCC condition");
22414 case ISD::SETOEQ:
22415 case ISD::SETEQ: SSECC = 0; break;
22416 case ISD::SETOGT:
22417   case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
22418 case ISD::SETLT:
22419 case ISD::SETOLT: SSECC = 1; break;
22420 case ISD::SETOGE:
22421   case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
22422 case ISD::SETLE:
22423 case ISD::SETOLE: SSECC = 2; break;
22424 case ISD::SETUO: SSECC = 3; break;
22425 case ISD::SETUNE:
22426 case ISD::SETNE: SSECC = 4; break;
22427   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22428 case ISD::SETUGE: SSECC = 5; break;
22429   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22430 case ISD::SETUGT: SSECC = 6; break;
22431 case ISD::SETO: SSECC = 7; break;
22432 case ISD::SETUEQ: SSECC = 8; break;
22433 case ISD::SETONE: SSECC = 12; break;
22434 }
22435 if (Swap)
22436 std::swap(Op0, Op1);
22437
22438 switch (SetCCOpcode) {
22439 default:
22440 IsAlwaysSignaling = true;
22441 break;
22442 case ISD::SETEQ:
22443 case ISD::SETOEQ:
22444 case ISD::SETUEQ:
22445 case ISD::SETNE:
22446 case ISD::SETONE:
22447 case ISD::SETUNE:
22448 case ISD::SETO:
22449 case ISD::SETUO:
22450 IsAlwaysSignaling = false;
22451 break;
22452 }
22453
22454 return SSECC;
22455}
22456
22457/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22458/// concatenate the result back.
22459static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
22460 EVT VT = Op.getValueType();
22461
22462   assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
22463   assert(Op.getOperand(0).getValueType().isInteger() &&
22464          VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
22465
22466 SDLoc dl(Op);
22467 SDValue CC = Op.getOperand(2);
22468
22469 // Extract the LHS Lo/Hi vectors
22470 SDValue LHS1, LHS2;
22471 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
22472
22473 // Extract the RHS Lo/Hi vectors
22474 SDValue RHS1, RHS2;
22475 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
22476
22477 // Issue the operation on the smaller types and concatenate the result back
22478 EVT LoVT, HiVT;
22479 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22480 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22481 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22482 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22483}
22484
22485static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22486
22487 SDValue Op0 = Op.getOperand(0);
22488 SDValue Op1 = Op.getOperand(1);
22489 SDValue CC = Op.getOperand(2);
22490 MVT VT = Op.getSimpleValueType();
22491 SDLoc dl(Op);
22492
22493   assert(VT.getVectorElementType() == MVT::i1 &&
22494          "Cannot set masked compare for this operation");
22495
22496 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22497
22498 // Prefer SETGT over SETLT.
22499 if (SetCCOpcode == ISD::SETLT) {
22500 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22501 std::swap(Op0, Op1);
22502 }
22503
22504 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22505}
22506
22507/// Given a buildvector constant, return a new vector constant with each element
22508/// incremented or decremented. If incrementing or decrementing would result in
22509/// unsigned overflow or underflow or this is not a simple vector constant,
22510/// return an empty value.
22511static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22512 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22513 if (!BV)
22514 return SDValue();
22515
22516 MVT VT = V.getSimpleValueType();
22517 MVT EltVT = VT.getVectorElementType();
22518 unsigned NumElts = VT.getVectorNumElements();
22519 SmallVector<SDValue, 8> NewVecC;
22520 SDLoc DL(V);
22521 for (unsigned i = 0; i < NumElts; ++i) {
22522 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22523 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22524 return SDValue();
22525
22526 // Avoid overflow/underflow.
22527 const APInt &EltC = Elt->getAPIntValue();
22528 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22529 return SDValue();
22530
22531 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22532 }
22533
22534 return DAG.getBuildVector(VT, DL, NewVecC);
22535}
22536
22537/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22538/// Op0 u<= Op1:
22539/// t = psubus Op0, Op1
22540/// pcmpeq t, <0..0>
22541static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22542 ISD::CondCode Cond, const SDLoc &dl,
22543 const X86Subtarget &Subtarget,
22544 SelectionDAG &DAG) {
22545 if (!Subtarget.hasSSE2())
22546 return SDValue();
22547
22548 MVT VET = VT.getVectorElementType();
22549 if (VET != MVT::i8 && VET != MVT::i16)
22550 return SDValue();
22551
22552 switch (Cond) {
22553 default:
22554 return SDValue();
22555 case ISD::SETULT: {
22556 // If the comparison is against a constant we can turn this into a
22557 // setule. With psubus, setule does not require a swap. This is
22558 // beneficial because the constant in the register is no longer
22559     // clobbered as the destination, so it can be hoisted out of a loop.
22560 // Only do this pre-AVX since vpcmp* is no longer destructive.
22561 if (Subtarget.hasAVX())
22562 return SDValue();
22563 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22564 if (!ULEOp1)
22565 return SDValue();
22566 Op1 = ULEOp1;
22567 break;
22568 }
22569 case ISD::SETUGT: {
22570 // If the comparison is against a constant, we can turn this into a setuge.
22571 // This is beneficial because materializing a constant 0 for the PCMPEQ is
22572 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
22573 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
22574 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
22575 if (!UGEOp1)
22576 return SDValue();
22577 Op1 = Op0;
22578 Op0 = UGEOp1;
22579 break;
22580 }
22581 // Psubus is better than flip-sign because it requires no inversion.
22582 case ISD::SETUGE:
22583 std::swap(Op0, Op1);
22584 break;
22585 case ISD::SETULE:
22586 break;
22587 }
22588
22589 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
22590 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
22591 DAG.getConstant(0, dl, VT));
22592}
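The PSUBUS trick, as a standalone SSE2 sketch (illustrative only, not part of this file): an unsigned saturating subtract yields zero exactly when a <= b, and PCMPEQ against zero converts that into the compare mask.

#include <emmintrin.h>
#include <cassert>

// PSUBUSB computes max(a - b, 0) per byte, so "a <=u b" holds exactly when
// the saturating difference is zero.
static __m128i unsignedLessEqualBytes(__m128i A, __m128i B) {
  __m128i Diff = _mm_subs_epu8(A, B);                    // saturating a - b
  return _mm_cmpeq_epi8(Diff, _mm_setzero_si128());      // 0xFF where a <= b
}

int main() {
  __m128i A = _mm_set1_epi8(5), B = _mm_set1_epi8(7);
  // Every lane satisfies 5 <= 7, so the mask is all ones.
  assert(_mm_movemask_epi8(unsignedLessEqualBytes(A, B)) == 0xFFFF);
  assert(_mm_movemask_epi8(unsignedLessEqualBytes(B, A)) == 0x0000);
  return 0;
}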
22593
22594static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
22595 SelectionDAG &DAG) {
22596 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22597 Op.getOpcode() == ISD::STRICT_FSETCCS;
22598 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22599 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22600 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
22601 MVT VT = Op->getSimpleValueType(0);
22602 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
22603 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
22604 SDLoc dl(Op);
22605
22606 if (isFP) {
22607#ifndef NDEBUG
22608 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
22609     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
22610#endif
22611
22612 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22613 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22614
22615 // If we have a strict compare with a vXi1 result and the input is 128/256
22616 // bits we can't use a masked compare unless we have VLX. If we use a wider
22617 // compare like we do for non-strict, we might trigger spurious exceptions
22618 // from the upper elements. Instead emit a AVX compare and convert to mask.
22619 unsigned Opc;
22620 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
22621 (!IsStrict || Subtarget.hasVLX() ||
22622 Op0.getSimpleValueType().is512BitVector())) {
22623 assert(VT.getVectorNumElements() <= 16);
22624 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
22625 } else {
22626 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
22627 // The SSE/AVX packed FP comparison nodes are defined with a
22628 // floating-point vector result that matches the operand type. This allows
22629 // them to work with an SSE1 target (integer vector types are not legal).
22630 VT = Op0.getSimpleValueType();
22631 }
22632
22633 SDValue Cmp;
22634 bool IsAlwaysSignaling;
22635 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
22636 if (!Subtarget.hasAVX()) {
22637 // TODO: We could use following steps to handle a quiet compare with
22638 // signaling encodings.
22639 // 1. Get ordered masks from a quiet ISD::SETO
22640 // 2. Use the masks to mask potential unordered elements in operand A, B
22641 // 3. Get the compare results of masked A, B
22642 // 4. Calculating final result using the mask and result from 3
22643 // But currently, we just fall back to scalar operations.
22644 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
22645 return SDValue();
22646
22647 // Insert an extra signaling instruction to raise an exception.
22648 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
22649 SDValue SignalCmp = DAG.getNode(
22650 Opc, dl, {VT, MVT::Other},
22651 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
22652 // FIXME: It seems we need to update the flags of all new strict nodes.
22653 // Otherwise, mayRaiseFPException in MI will return false due to
22654 // NoFPExcept = false by default. However, I didn't find it in other
22655 // patches.
22656 SignalCmp->setFlags(Op->getFlags());
22657 Chain = SignalCmp.getValue(1);
22658 }
22659
22660 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
22661 // emit two comparisons and a logic op to tie them together.
22662 if (SSECC >= 8) {
22663 // LLVM predicate is SETUEQ or SETONE.
22664 unsigned CC0, CC1;
22665 unsigned CombineOpc;
22666 if (Cond == ISD::SETUEQ) {
22667 CC0 = 3; // UNORD
22668 CC1 = 0; // EQ
22669 CombineOpc = X86ISD::FOR;
22670 } else {
22671 assert(Cond == ISD::SETONE);
22672 CC0 = 7; // ORD
22673 CC1 = 4; // NEQ
22674 CombineOpc = X86ISD::FAND;
22675 }
22676
22677 SDValue Cmp0, Cmp1;
22678 if (IsStrict) {
22679 Cmp0 = DAG.getNode(
22680 Opc, dl, {VT, MVT::Other},
22681 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
22682 Cmp1 = DAG.getNode(
22683 Opc, dl, {VT, MVT::Other},
22684 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
22685 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
22686 Cmp1.getValue(1));
22687 } else {
22688 Cmp0 = DAG.getNode(
22689 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
22690 Cmp1 = DAG.getNode(
22691 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
22692 }
22693 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
22694 } else {
22695 if (IsStrict) {
22696 Cmp = DAG.getNode(
22697 Opc, dl, {VT, MVT::Other},
22698 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
22699 Chain = Cmp.getValue(1);
22700 } else
22701 Cmp = DAG.getNode(
22702 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
22703 }
22704 } else {
22705 // Handle all other FP comparisons here.
22706 if (IsStrict) {
22707 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
22708 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
22709 Cmp = DAG.getNode(
22710 Opc, dl, {VT, MVT::Other},
22711 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
22712 Chain = Cmp.getValue(1);
22713 } else
22714 Cmp = DAG.getNode(
22715 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
22716 }
22717
22718 if (VT.getFixedSizeInBits() >
22719 Op.getSimpleValueType().getFixedSizeInBits()) {
22720 // We emitted a compare with an XMM/YMM result. Finish converting to a
22721 // mask register using a vptestm.
22722 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
22723 Cmp = DAG.getBitcast(CastVT, Cmp);
22724 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
22725 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
22726 } else {
22727 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
22728 // the result type of SETCC. The bitcast is expected to be optimized
22729 // away during combining/isel.
22730 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
22731 }
22732
22733 if (IsStrict)
22734 return DAG.getMergeValues({Cmp, Chain}, dl);
22735
22736 return Cmp;
22737 }
22738
22739 assert(!IsStrict && "Strict SETCC only handles FP operands.");
22740
22741 MVT VTOp0 = Op0.getSimpleValueType();
22742 (void)VTOp0;
22743 assert(VTOp0 == Op1.getSimpleValueType() &&
22744        "Expected operands with same type!");
22745 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
22746        "Invalid number of packed elements for source and destination!");
22747
22748 // The non-AVX512 code below works under the assumption that source and
22749 // destination types are the same.
22750 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
22751        "Value types for source and destination must be the same!");
22752
22753 // The result is boolean, but operands are int/float
22754 if (VT.getVectorElementType() == MVT::i1) {
22755 // In AVX-512 architecture setcc returns mask with i1 elements,
22756 // But there is no compare instruction for i8 and i16 elements in KNL.
22757 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
22758        "Unexpected operand type");
22759 return LowerIntVSETCC_AVX512(Op, DAG);
22760 }
22761
22762 // Lower using XOP integer comparisons.
22763 if (VT.is128BitVector() && Subtarget.hasXOP()) {
22764 // Translate compare code to XOP PCOM compare mode.
22765 unsigned CmpMode = 0;
22766 switch (Cond) {
22767 default: llvm_unreachable("Unexpected SETCC condition");
22768 case ISD::SETULT:
22769 case ISD::SETLT: CmpMode = 0x00; break;
22770 case ISD::SETULE:
22771 case ISD::SETLE: CmpMode = 0x01; break;
22772 case ISD::SETUGT:
22773 case ISD::SETGT: CmpMode = 0x02; break;
22774 case ISD::SETUGE:
22775 case ISD::SETGE: CmpMode = 0x03; break;
22776 case ISD::SETEQ: CmpMode = 0x04; break;
22777 case ISD::SETNE: CmpMode = 0x05; break;
22778 }
22779
22780 // Are we comparing unsigned or signed integers?
22781 unsigned Opc =
22782 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
22783
22784 return DAG.getNode(Opc, dl, VT, Op0, Op1,
22785 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
22786 }
22787
22788 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
22789 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
22790 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
22791 SDValue BC0 = peekThroughBitcasts(Op0);
22792 if (BC0.getOpcode() == ISD::AND) {
22793 APInt UndefElts;
22794 SmallVector<APInt, 64> EltBits;
22795 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
22796 VT.getScalarSizeInBits(), UndefElts,
22797 EltBits, false, false)) {
22798 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
22799 Cond = ISD::SETEQ;
22800 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
22801 }
22802 }
22803 }
22804 }
22805
22806 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
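  // [Editor illustration, not part of the original source; example values
  // assumed] With 8-bit lanes and C = 0x04 (log2 = 2): ShiftAmt = 8 - 2 - 1 = 5.
  // SHL by 5 moves bit 2 into the sign bit, and SRA by 7 broadcasts it, giving
  // all-ones exactly when (X & 4) == 4 and all-zeros otherwise -- the same
  // boolean vector a compare against C would produce.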
22807 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
22808 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
22809 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
22810 if (C1 && C1->getAPIntValue().isPowerOf2()) {
22811 unsigned BitWidth = VT.getScalarSizeInBits();
22812 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
22813
22814 SDValue Result = Op0.getOperand(0);
22815 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
22816 DAG.getConstant(ShiftAmt, dl, VT));
22817 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
22818 DAG.getConstant(BitWidth - 1, dl, VT));
22819 return Result;
22820 }
22821 }
22822
22823 // Break 256-bit integer vector compare into smaller ones.
22824 if (VT.is256BitVector() && !Subtarget.hasInt256())
22825 return splitIntVSETCC(Op, DAG);
22826
22827 if (VT == MVT::v32i16 || VT == MVT::v64i8) {
22828 assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
22829 return splitIntVSETCC(Op, DAG);
22830 }
22831
22832 // If this is a SETNE against the signed minimum value, change it to SETGT.
22833 // If this is a SETNE against the signed maximum value, change it to SETLT,
22834 // which will later be swapped to SETGT.
22835 // Otherwise we use PCMPEQ+invert.
22836 APInt ConstValue;
22837 if (Cond == ISD::SETNE &&
22838 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
22839 if (ConstValue.isMinSignedValue())
22840 Cond = ISD::SETGT;
22841 else if (ConstValue.isMaxSignedValue())
22842 Cond = ISD::SETLT;
22843 }
22844
22845 // If both operands are known non-negative, then an unsigned compare is the
22846 // same as a signed compare and there's no need to flip signbits.
22847 // TODO: We could check for more general simplifications here since we're
22848 // computing known bits.
22849 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
22850 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
22851
22852 // Special case: Use min/max operations for unsigned compares.
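  // [Editor illustration, not part of the original source] This relies on the
  // identities X u<= Y <=> umin(X, Y) == X and X u>= Y <=> umax(X, Y) == X;
  // e.g. for u8, umin(3, 200) == 3 so 3 u<= 200 holds, while umin(200, 3) == 3
  // != 200 so 200 u<= 3 does not. SETUGT/SETULT are handled by computing the
  // complementary compare and inverting the result.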
22853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22854 if (ISD::isUnsignedIntSetCC(Cond) &&
22855 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
22856 TLI.isOperationLegal(ISD::UMIN, VT)) {
22857 // If we have a constant operand, increment/decrement it and change the
22858 // condition to avoid an invert.
22859 if (Cond == ISD::SETUGT) {
22860 // X > C --> X >= (C+1) --> X == umax(X, C+1)
22861 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
22862 Op1 = UGTOp1;
22863 Cond = ISD::SETUGE;
22864 }
22865 }
22866 if (Cond == ISD::SETULT) {
22867 // X < C --> X <= (C-1) --> X == umin(X, C-1)
22868 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
22869 Op1 = ULTOp1;
22870 Cond = ISD::SETULE;
22871 }
22872 }
22873 bool Invert = false;
22874 unsigned Opc;
22875 switch (Cond) {
22876 default: llvm_unreachable("Unexpected condition code");
22877 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
22878 case ISD::SETULE: Opc = ISD::UMIN; break;
22879 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
22880 case ISD::SETUGE: Opc = ISD::UMAX; break;
22881 }
22882
22883 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
22884 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
22885
22886 // If the logical-not of the result is required, perform that now.
22887 if (Invert)
22888 Result = DAG.getNOT(dl, Result, VT);
22889
22890 return Result;
22891 }
22892
22893 // Try to use SUBUS and PCMPEQ.
22894 if (FlipSigns)
22895 if (SDValue V =
22896 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
22897 return V;
22898
22899 // We are handling one of the integer comparisons here. Since SSE only has
22900 // GT and EQ comparisons for integers, swapping operands and multiple
22901 // operations may be required for some comparisons.
22902 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
22903 : X86ISD::PCMPGT;
22904 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
22905 Cond == ISD::SETGE || Cond == ISD::SETUGE;
22906 bool Invert = Cond == ISD::SETNE ||
22907 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
22908
22909 if (Swap)
22910 std::swap(Op0, Op1);
22911
22912 // Check that the operation in question is available (most are plain SSE2,
22913 // but PCMPGTQ and PCMPEQQ have different requirements).
22914 if (VT == MVT::v2i64) {
22915 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
22916 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
22917
22918 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
22919 // the odd elements over the even elements.
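      // [Editor illustration, not part of the original source] With Op0 being
      // all zeros, 0 s> X just tests the sign bit of each 64-bit lane, which
      // lives in its upper 32-bit half. The v4i32 PCMPGT computes that test in
      // the odd elements, and the {1,1,3,3} shuffle copies each odd element
      // over its even neighbour so the whole 64-bit lane holds the result.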
22920 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
22921 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
22922 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22923
22924 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22925 static const int MaskHi[] = { 1, 1, 3, 3 };
22926 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22927
22928 return DAG.getBitcast(VT, Result);
22929 }
22930
22931 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
22932 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22933 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
22934
22935 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22936 static const int MaskHi[] = { 1, 1, 3, 3 };
22937 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22938
22939 return DAG.getBitcast(VT, Result);
22940 }
22941
22942 // Since SSE has no unsigned integer comparisons, we need to flip the sign
22943 // bits of the inputs before performing those operations. The lower
22944 // compare is always unsigned.
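      // [Editor illustration, not part of the original source; example values
      // assumed] XOR-ing with the sign mask maps unsigned order onto signed
      // order: 0x00000001 u< 0xFFFFFFFF, and after XOR with 0x80000000 the
      // values become 0x80000001 (very negative) and 0x7FFFFFFF (INT_MAX),
      // which compare the same way under signed PCMPGT.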
22945 SDValue SB;
22946 if (FlipSigns) {
22947 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
22948 } else {
22949 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
22950 }
22951 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
22952 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
22953
22954 // Cast everything to the right type.
22955 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22956 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22957
22958 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
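      // [Editor illustration, not part of the original source; example values
      // assumed] The high 32 bits decide unless they are equal, in which case
      // the (unsigned) low 32 bits decide: 0x00000002_00000000 > 0x00000001_FFFFFFFF
      // because hi1 (2) > hi2 (1), while 0x00000001_00000005 > 0x00000001_00000003
      // because the highs are equal and lo1 > lo2.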
22959 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22960 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
22961
22962 // Create masks for only the low parts/high parts of the 64 bit integers.
22963 static const int MaskHi[] = { 1, 1, 3, 3 };
22964 static const int MaskLo[] = { 0, 0, 2, 2 };
22965 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
22966 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
22967 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22968
22969 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
22970 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
22971
22972 if (Invert)
22973 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22974
22975 return DAG.getBitcast(VT, Result);
22976 }
22977
22978 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
22979 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
22980 // pcmpeqd + pshufd + pand.
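      // [Editor illustration, not part of the original source] Two 64-bit lanes
      // are equal exactly when both of their 32-bit halves are equal: PCMPEQD
      // compares the halves, the PSHUFD with mask {1,0,3,2} swaps the halves
      // within each 64-bit lane, and the PAND leaves all-ones in a lane only
      // if both halves matched.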
22981 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
22982
22983 // First cast everything to the right type.
22984 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22985 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22986
22987 // Do the compare.
22988 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
22989
22990 // Make sure the lower and upper halves are both all-ones.
22991 static const int Mask[] = { 1, 0, 3, 2 };
22992 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
22993 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
22994
22995 if (Invert)
22996 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22997
22998 return DAG.getBitcast(VT, Result);
22999 }
23000 }
23001
23002 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23003 // bits of the inputs before performing those operations.
23004 if (FlipSigns) {
23005 MVT EltVT = VT.getVectorElementType();
23006 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23007 VT);
23008 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23009 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23010 }
23011
23012 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23013
23014 // If the logical-not of the result is required, perform that now.
23015 if (Invert)
23016 Result = DAG.getNOT(dl, Result, VT);
23017
23018 return Result;
23019}
23020
23021// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23022static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23023 const SDLoc &dl, SelectionDAG &DAG,
23024 const X86Subtarget &Subtarget,
23025 SDValue &X86CC) {
23026 // Only support equality comparisons.
23027 if (CC != ISD::SETEQ && CC != ISD::SETNE)
23028 return SDValue();
23029
23030 // Must be a bitcast from vXi1.
23031 if (Op0.getOpcode() != ISD::BITCAST)
23032 return SDValue();
23033
23034 Op0 = Op0.getOperand(0);
23035 MVT VT = Op0.getSimpleValueType();
23036 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23037 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23038 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23039 return SDValue();
23040
23041 X86::CondCode X86Cond;
23042 if (isNullConstant(Op1)) {
23043 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23044 } else if (isAllOnesConstant(Op1)) {
23045 // C flag is set for all ones.
23046 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23047 } else
23048 return SDValue();
23049
23050 // If the input is an AND, we can combine its operands into the KTEST.
23051 bool KTestable = false;
23052 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23053 KTestable = true;
23054 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23055 KTestable = true;
23056 if (!isNullConstant(Op1))
23057 KTestable = false;
23058 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23059 SDValue LHS = Op0.getOperand(0);
23060 SDValue RHS = Op0.getOperand(1);
23061 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23062 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23063 }
23064
23065 // If the input is an OR, we can combine its operands into the KORTEST.
23066 SDValue LHS = Op0;
23067 SDValue RHS = Op0;
23068 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23069 LHS = Op0.getOperand(0);
23070 RHS = Op0.getOperand(1);
23071 }
23072
23073 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23074 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23075}
23076
23077/// Emit flags for the given setcc condition and operands. Also returns the
23078/// corresponding X86 condition code constant in X86CC.
23079SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23080 ISD::CondCode CC, const SDLoc &dl,
23081 SelectionDAG &DAG,
23082 SDValue &X86CC) const {
23083 // Optimize to BT if possible.
23084 // Lower (X & (1 << N)) == 0 to BT(X, N).
23085 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23086 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
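  // [Editor illustration, not part of the original source; N = 5 assumed]
  // (X & (1 << 5)) == 0 becomes BT X, 5 followed by a carry-clear check
  // (COND_AE), and ((X >> 5) & 1) != 0 becomes BT X, 5 with a carry-set check
  // (COND_B): BT copies the selected bit into CF, so no AND or CMP is needed.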
23087 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23088 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23089 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23090 return BT;
23091 }
23092
23093 // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23094 // TODO: We could handle an AND tree with all 1s as well by using the C flag.
23095 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23096 if (SDValue CmpZ =
23097 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23098 return CmpZ;
23099
23100 // Try to lower using KORTEST or KTEST.
23101 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23102 return Test;
23103
23104 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
23105 // these.
23106 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23107 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23108 // If the input is a setcc, then reuse the input setcc or use a new one with
23109 // the inverted condition.
23110 if (Op0.getOpcode() == X86ISD::SETCC) {
23111 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23112
23113 X86CC = Op0.getOperand(0);
23114 if (Invert) {
23115 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23116 CCode = X86::GetOppositeBranchCondition(CCode);
23117 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23118 }
23119
23120 return Op0.getOperand(1);
23121 }
23122 }
23123
23124 // Try to use the carry flag from the add in place of a separate CMP for:
23125 // (seteq (add X, -1), -1). Similar for setne.
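  // [Editor illustration, not part of the original source] ADD X, -1 produces
  // a carry exactly when X != 0 (adding all-ones wraps for any non-zero X), so
  // (X + -1) == -1, i.e. X == 0, corresponds to carry clear (COND_AE) and the
  // setne form to carry set (COND_B), letting the ADD's flags stand in for a
  // separate CMP.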
23126 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23127 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23128 if (isProfitableToUseFlagOp(Op0)) {
23129 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23130
23131 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23132 Op0.getOperand(1));
23133 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23134 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23135 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23136 return SDValue(New.getNode(), 1);
23137 }
23138 }
23139
23140 X86::CondCode CondCode =
23141 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23142 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23143
23144 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23145 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23146 return EFLAGS;
23147}
23148
23149SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23150
23151 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23152 Op.getOpcode() == ISD::STRICT_FSETCCS;
23153 MVT VT = Op->getSimpleValueType(0);
23154
23155 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23156
23157 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23158 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23159 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23160 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23161 SDLoc dl(Op);
23162 ISD::CondCode CC =
23163 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23164
23165 // Handle f128 first, since one possible outcome is a normal integer
23166 // comparison which gets handled by emitFlagsForSetcc.
23167 if (Op0.getValueType() == MVT::f128) {
23168 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23169 Op.getOpcode() == ISD::STRICT_FSETCCS);
23170
23171 // If softenSetCCOperands returned a scalar, use it.
23172 if (!Op1.getNode()) {
23173 assert(Op0.getValueType() == Op.getValueType() &&
23174        "Unexpected setcc expansion!");
23175 if (IsStrict)
23176 return DAG.getMergeValues({Op0, Chain}, dl);
23177 return Op0;
23178 }
23179 }
23180
23181 if (Op0.getSimpleValueType().isInteger()) {
23182 SDValue X86CC;
23183 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23184 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23185 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23186 }
23187
23188 // Handle floating point.
23189 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23190 if (CondCode == X86::COND_INVALID)
23191 return SDValue();
23192
23193 SDValue EFLAGS;
23194 if (IsStrict) {
23195 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23196 EFLAGS =
23197 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23198 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23199 Chain = EFLAGS.getValue(1);
23200 } else {
23201 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23202 }
23203
23204 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23205 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23206 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23207}
23208
23209SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23210 SDValue LHS = Op.getOperand(0);
23211 SDValue RHS = Op.getOperand(1);
23212 SDValue Carry = Op.getOperand(2);
23213 SDValue Cond = Op.getOperand(3);
23214 SDLoc DL(Op);
23215
23216 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23217 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23218
23219 // Recreate the carry if needed.
23220 EVT CarryVT = Carry.getValueType();
23221 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23222 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23223
23224 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23225 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23226 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23227}
23228
23229// This function returns three things: the arithmetic computation itself
23230// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23231// flag and the condition code define the case in which the arithmetic
23232// computation overflows.
23233static std::pair<SDValue, SDValue>
23234getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23235 assert(Op.getResNo() == 0 && "Unexpected result number!");
23236 SDValue Value, Overflow;
23237 SDValue LHS = Op.getOperand(0);
23238 SDValue RHS = Op.getOperand(1);
23239 unsigned BaseOp = 0;
23240 SDLoc DL(Op);
23241 switch (Op.getOpcode()) {
23242 default: llvm_unreachable("Unknown ovf instruction!");
23243 case ISD::SADDO:
23244 BaseOp = X86ISD::ADD;
23245 Cond = X86::COND_O;
23246 break;
23247 case ISD::UADDO:
23248 BaseOp = X86ISD::ADD;
23249 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23250 break;
23251 case ISD::SSUBO:
23252 BaseOp = X86ISD::SUB;
23253 Cond = X86::COND_O;
23254 break;
23255 case ISD::USUBO:
23256 BaseOp = X86ISD::SUB;
23257 Cond = X86::COND_B;
23258 break;
23259 case ISD::SMULO:
23260 BaseOp = X86ISD::SMUL;
23261 Cond = X86::COND_O;
23262 break;
23263 case ISD::UMULO:
23264 BaseOp = X86ISD::UMUL;
23265 Cond = X86::COND_O;
23266 break;
23267 }
23268
23269 if (BaseOp) {
23270 // Also sets EFLAGS.
23271 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23272 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23273 Overflow = Value.getValue(1);
23274 }
23275
23276 return std::make_pair(Value, Overflow);
23277}
23278
23279static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23280 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23281 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23282 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23283 // has only one use.
23284 SDLoc DL(Op);
23285 X86::CondCode Cond;
23286 SDValue Value, Overflow;
23287 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23288
23289 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23290 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23291 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23292}
23293
23294/// Return true if opcode is a X86 logical comparison.
23295static bool isX86LogicalCmp(SDValue Op) {
23296 unsigned Opc = Op.getOpcode();
23297 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23298 Opc == X86ISD::FCMP)
23299 return true;
23300 if (Op.getResNo() == 1 &&
23301 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23302 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23303 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23304 return true;
23305
23306 return false;
23307}
23308
23309static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23310 if (V.getOpcode() != ISD::TRUNCATE)
23311 return false;
23312
23313 SDValue VOp0 = V.getOperand(0);
23314 unsigned InBits = VOp0.getValueSizeInBits();
23315 unsigned Bits = V.getValueSizeInBits();
23316 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23317}
23318
23319SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23320 bool AddTest = true;
23321 SDValue Cond = Op.getOperand(0);
23322 SDValue Op1 = Op.getOperand(1);
23323 SDValue Op2 = Op.getOperand(2);
23324 SDLoc DL(Op);
23325 MVT VT = Op1.getSimpleValueType();
23326 SDValue CC;
23327
23328 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23329 // are available or VBLENDV if AVX is available.
23330 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23331 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23332 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23333 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23334 bool IsAlwaysSignaling;
23335 unsigned SSECC =
23336 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23337 CondOp0, CondOp1, IsAlwaysSignaling);
23338
23339 if (Subtarget.hasAVX512()) {
23340 SDValue Cmp =
23341 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23342 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23343 assert(!VT.isVector() && "Not a scalar type?");
23344 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23345 }
23346
23347 if (SSECC < 8 || Subtarget.hasAVX()) {
23348 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23349 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23350
23351 // If we have AVX, we can use a variable vector select (VBLENDV) instead
23352 // of 3 logic instructions for size savings and potentially speed.
23353 // Unfortunately, there is no scalar form of VBLENDV.
23354
23355 // If either operand is a +0.0 constant, don't try this. We can expect to
23356 // optimize away at least one of the logic instructions later in that
23357 // case, so that sequence would be faster than a variable blend.
23358
23359 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23360 // uses XMM0 as the selection register. That may need just as many
23361 // instructions as the AND/ANDN/OR sequence due to register moves, so
23362 // don't bother.
23363 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23364 !isNullFPConstant(Op2)) {
23365 // Convert to vectors, do a VSELECT, and convert back to scalar.
23366 // All of the conversions should be optimized away.
23367 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23368 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23369 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23370 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23371
23372 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23373 VCmp = DAG.getBitcast(VCmpVT, VCmp);
23374
23375 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23376
23377 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23378 VSel, DAG.getIntPtrConstant(0, DL));
23379 }
23380 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23381 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23382 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23383 }
23384 }
23385
23386 // AVX512 fallback is to lower selects of scalar floats to masked moves.
23387 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23388 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23389 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23390 }
23391
23392 if (Cond.getOpcode() == ISD::SETCC) {
23393 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23394 Cond = NewCond;
23395 // If the condition was updated, it's possible that the operands of the
23396 // select were also updated (for example, EmitTest has a RAUW). Refresh
23397 // the local references to the select operands in case they got stale.
23398 Op1 = Op.getOperand(1);
23399 Op2 = Op.getOperand(2);
23400 }
23401 }
23402
23403 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23404 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23405 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23406 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23407 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23408 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
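  // [Editor illustration, not part of the original source] For the first
  // pattern with x == 0: x - 1 borrows, SBB(0, 0, borrow) yields all-ones, and
  // OR-ing with y gives the required -1; for x != 0 there is no borrow, the
  // SBB yields 0, and the result is just y.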
23409 if (Cond.getOpcode() == X86ISD::SETCC &&
23410 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23411 isNullConstant(Cond.getOperand(1).getOperand(1))) {
23412 SDValue Cmp = Cond.getOperand(1);
23413 SDValue CmpOp0 = Cmp.getOperand(0);
23414 unsigned CondCode = Cond.getConstantOperandVal(0);
23415
23416 // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
23417 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Skip the transforms below
23418 // to keep the CMP with 0. The CMP should later be removed by
23419 // optimizeCompareInst, using the flags from the BSR/TZCNT emitted for the
23420 // cttz_zero_undef.
23421 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23422 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23423 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23424 };
23425 if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23426 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23427 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23428 // Keep Cmp.
23429 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23430 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23431 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23432
23433 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23434 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23435
23436 // Apply further optimizations for special cases
23437 // (select (x != 0), -1, 0) -> neg & sbb
23438 // (select (x == 0), 0, -1) -> neg & sbb
23439 if (isNullConstant(Y) &&
23440 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23441 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23442 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23443 Zero = DAG.getConstant(0, DL, Op.getValueType());
23444 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23445 }
23446
23447 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23448 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23449
23450 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23451 SDValue Res = // Res = 0 or -1.
23452 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23453
23454 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23455 Res = DAG.getNOT(DL, Res, Res.getValueType());
23456
23457 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23458 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23459 Cmp.getOperand(0).getOpcode() == ISD::AND &&
23460 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23461 SDValue Src1, Src2;
23462 // Returns true if Op2 is an XOR or OR operator and one of its operands
23463 // is equal to Op1, i.e. the select looks like
23464 // (a, a op b) or (b, a op b).
23465 auto isOrXorPattern = [&]() {
23466 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23467 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23468 Src1 =
23469 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23470 Src2 = Op1;
23471 return true;
23472 }
23473 return false;
23474 };
23475
23476 if (isOrXorPattern()) {
23477 SDValue Neg;
23478 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23479 // We need a mask of all zeros or all ones with the same size as the other
23480 // operands.
23481 if (CmpSz > VT.getSizeInBits())
23482 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23483 else if (CmpSz < VT.getSizeInBits())
23484 Neg = DAG.getNode(ISD::AND, DL, VT,
23485 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23486 DAG.getConstant(1, DL, VT));
23487 else
23488 Neg = CmpOp0;
23489 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23490 Neg); // -(and (x, 0x1))
23491 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23492 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
23493 }
23494 }
23495 }
23496
23497 // Look past (and (setcc_carry (cmp ...)), 1).
23498 if (Cond.getOpcode() == ISD::AND &&
23499 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23500 isOneConstant(Cond.getOperand(1)))
23501 Cond = Cond.getOperand(0);
23502
23503 // If condition flag is set by a X86ISD::CMP, then use it as the condition
23504 // setting operand in place of the X86ISD::SETCC.
23505 unsigned CondOpcode = Cond.getOpcode();
23506 if (CondOpcode == X86ISD::SETCC ||
23507 CondOpcode == X86ISD::SETCC_CARRY) {
23508 CC = Cond.getOperand(0);
23509
23510 SDValue Cmp = Cond.getOperand(1);
23511 bool IllegalFPCMov = false;
23512 if (VT.isFloatingPoint() && !VT.isVector() &&
23513 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
23514 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23515
23516 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23517 Cmp.getOpcode() == X86ISD::BT) { // FIXME
23518 Cond = Cmp;
23519 AddTest = false;
23520 }
23521 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23522 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23523 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23524 SDValue Value;
23525 X86::CondCode X86Cond;
23526 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23527
23528 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23529 AddTest = false;
23530 }
23531
23532 if (AddTest) {
23533 // Look past the truncate if the high bits are known zero.
23534 if (isTruncWithZeroHighBitsInput(Cond, DAG))
23535 Cond = Cond.getOperand(0);
23536
23537 // We know the result of AND is compared against zero. Try to match
23538 // it to BT.
23539 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
23540 SDValue BTCC;
23541 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
23542 CC = BTCC;
23543 Cond = BT;
23544 AddTest = false;
23545 }
23546 }
23547 }
23548
23549 if (AddTest) {
23550 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
23551 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
23552 }
23553
23554 // a < b ? -1 : 0 -> RES = ~setcc_carry
23555 // a < b ? 0 : -1 -> RES = setcc_carry
23556 // a >= b ? -1 : 0 -> RES = setcc_carry
23557 // a >= b ? 0 : -1 -> RES = ~setcc_carry
23558 if (Cond.getOpcode() == X86ISD::SUB) {
23559 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
23560
23561 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
23562 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23563 (isNullConstant(Op1) || isNullConstant(Op2))) {
23564 SDValue Res =
23565 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
23566 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
23567 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
23568 return DAG.getNOT(DL, Res, Res.getValueType());
23569 return Res;
23570 }
23571 }
23572
23573 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
23574 // widen the cmov and push the truncate through. This avoids introducing a new
23575 // branch during isel and doesn't add any extensions.
23576 if (Op.getValueType() == MVT::i8 &&
23577 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
23578 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
23579 if (T1.getValueType() == T2.getValueType() &&
23580 // Exclude CopyFromReg to avoid partial register stalls.
23581 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
23582 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
23583 CC, Cond);
23584 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23585 }
23586 }
23587
23588 // Or finally, promote i8 cmovs if we have CMOV,
23589 // or i16 cmovs if it won't prevent folding a load.
23590 // FIXME: we should not limit promotion of the i8 case to only when the CMOV
23591 // is legal, but EmitLoweredSelect() cannot deal with these extensions
23592 // being inserted between two CMOVs. (in the i16 case too, TBN)
23593 // https://bugs.llvm.org/show_bug.cgi?id=40974
23594 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
23595 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
23596 !MayFoldLoad(Op2))) {
23597 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
23598 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
23599 SDValue Ops[] = { Op2, Op1, CC, Cond };
23600 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
23601 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23602 }
23603
23604 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
23605 // condition is true.
23606 SDValue Ops[] = { Op2, Op1, CC, Cond };
23607 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
23608}
23609
23610static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
23611 const X86Subtarget &Subtarget,
23612 SelectionDAG &DAG) {
23613 MVT VT = Op->getSimpleValueType(0);
23614 SDValue In = Op->getOperand(0);
23615 MVT InVT = In.getSimpleValueType();
23616 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
23617 MVT VTElt = VT.getVectorElementType();
23618 SDLoc dl(Op);
23619
23620 unsigned NumElts = VT.getVectorNumElements();
23621
23622 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
23623 MVT ExtVT = VT;
23624 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
23625 // If v16i32 is to be avoided, we'll need to split and concatenate.
23626 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
23627 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
23628
23629 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
23630 }
23631
23632 // Widen to 512-bits if VLX is not supported.
23633 MVT WideVT = ExtVT;
23634 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
23635 NumElts *= 512 / ExtVT.getSizeInBits();
23636 InVT = MVT::getVectorVT(MVT::i1, NumElts);
23637 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
23638 In, DAG.getIntPtrConstant(0, dl));
23639 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
23640 }
23641
23642 SDValue V;
23643 MVT WideEltVT = WideVT.getVectorElementType();
23644 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
23645 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
23646 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
23647 } else {
23648 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
23649 SDValue Zero = DAG.getConstant(0, dl, WideVT);
23650 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
23651 }
23652
23653 // Truncate if we had to extend i16/i8 above.
23654 if (VT != ExtVT) {
23655 WideVT = MVT::getVectorVT(VTElt, NumElts);
23656 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
23657 }
23658
23659 // Extract back to 128/256-bit if we widened.
23660 if (WideVT != VT)
23661 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
23662 DAG.getIntPtrConstant(0, dl));
23663
23664 return V;
23665}
23666
23667static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
23668 SelectionDAG &DAG) {
23669 SDValue In = Op->getOperand(0);
23670 MVT InVT = In.getSimpleValueType();
23671
23672 if (InVT.getVectorElementType() == MVT::i1)
23673 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
23674
23675 assert(Subtarget.hasAVX() && "Expected AVX support");
23676 return LowerAVXExtend(Op, DAG, Subtarget);
23677}
23678
23679// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
23680// For sign extend this needs to handle all vector sizes and SSE4.1 and
23681// non-SSE4.1 targets. For zero extend this should only handle inputs of
23682// MVT::v64i8 when BWI is not supported, but AVX512 is.
23683static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
23684 const X86Subtarget &Subtarget,
23685 SelectionDAG &DAG) {
23686 SDValue In = Op->getOperand(0);
23687 MVT VT = Op->getSimpleValueType(0);
23688 MVT InVT = In.getSimpleValueType();
23689
23690 MVT SVT = VT.getVectorElementType();
23691 MVT InSVT = InVT.getVectorElementType();
23692 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
23693
23694 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
23695 return SDValue();
23696 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
23697 return SDValue();
23698 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
23699 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
23700 !(VT.is512BitVector() && Subtarget.hasAVX512()))
23701 return SDValue();
23702
23703 SDLoc dl(Op);
23704 unsigned Opc = Op.getOpcode();
23705 unsigned NumElts = VT.getVectorNumElements();
23706
23707 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
23708 // For 512-bit vectors, we need 128-bits or 256-bits.
23709 if (InVT.getSizeInBits() > 128) {
23710 // Input needs to be at least the same number of elements as output, and
23711 // at least 128-bits.
23712 int InSize = InSVT.getSizeInBits() * NumElts;
23713 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
23714 InVT = In.getSimpleValueType();
23715 }
23716
23717 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
23718 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
23719 // need to be handled here for 256/512-bit results.
23720 if (Subtarget.hasInt256()) {
23721 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
23722
23723 if (InVT.getVectorNumElements() != NumElts)
23724 return DAG.getNode(Op.getOpcode(), dl, VT, In);
23725
23726 // FIXME: Apparently we create inreg operations that could be regular
23727 // extends.
23728 unsigned ExtOpc =
23729 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
23730 : ISD::ZERO_EXTEND;
23731 return DAG.getNode(ExtOpc, dl, VT, In);
23732 }
23733
23734 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
23735 if (Subtarget.hasAVX()) {
23736 assert(VT.is256BitVector() && "256-bit vector expected");
23737 MVT HalfVT = VT.getHalfNumVectorElementsVT();
23738 int HalfNumElts = HalfVT.getVectorNumElements();
23739
23740 unsigned NumSrcElts = InVT.getVectorNumElements();
23741 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
23742 for (int i = 0; i != HalfNumElts; ++i)
23743 HiMask[i] = HalfNumElts + i;
23744
23745 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
23746 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
23747 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
23748 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
23749 }
23750
23751 // We should only get here for sign extend.
23752 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
23753 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
23754
23755 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
23756 SDValue Curr = In;
23757 SDValue SignExt = Curr;
23758
23759 // As SRAI is only available on i16/i32 types, we expand only up to i32
23760 // and handle i64 separately.
23761 if (InVT != MVT::v4i32) {
23762 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
23763
23764 unsigned DestWidth = DestVT.getScalarSizeInBits();
23765 unsigned Scale = DestWidth / InSVT.getSizeInBits();
23766
23767 unsigned InNumElts = InVT.getVectorNumElements();
23768 unsigned DestElts = DestVT.getVectorNumElements();
23769
23770 // Build a shuffle mask that takes each input element and places it in the
23771 // MSBs of the new element size.
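// For example (a sketch of the loop below, assuming In is v16i8 and DestVT is
// v4i32, so Scale = 32 / 8 = 4): the mask becomes
//   {-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3}
// placing each byte in the top byte of its 32-bit lane, so the VSRAI by
// DestWidth - 8 = 24 bits afterwards completes the sign extension.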
23772 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
23773 for (unsigned i = 0; i != DestElts; ++i)
23774 Mask[i * Scale + (Scale - 1)] = i;
23775
23776 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
23777 Curr = DAG.getBitcast(DestVT, Curr);
23778
23779 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
23780 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
23781 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
23782 }
23783
23784 if (VT == MVT::v2i64) {
23785 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
23786 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
23787 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
23788 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
23789 SignExt = DAG.getBitcast(VT, SignExt);
23790 }
23791
23792 return SignExt;
23793}
23794
23795static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
23796 SelectionDAG &DAG) {
23797 MVT VT = Op->getSimpleValueType(0);
23798 SDValue In = Op->getOperand(0);
23799 MVT InVT = In.getSimpleValueType();
23800 SDLoc dl(Op);
23801
23802 if (InVT.getVectorElementType() == MVT::i1)
23803 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
23804
23805 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
23806 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
23807 "Expected same number of elements");
23808 assert((VT.getVectorElementType() == MVT::i16 ||
23809 VT.getVectorElementType() == MVT::i32 ||
23810 VT.getVectorElementType() == MVT::i64) &&
23811 "Unexpected element type");
23812 assert((InVT.getVectorElementType() == MVT::i8 ||
23813 InVT.getVectorElementType() == MVT::i16 ||
23814 InVT.getVectorElementType() == MVT::i32) &&
23815 "Unexpected element type");
23816
23817 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
23818 assert(InVT == MVT::v32i8 && "Unexpected VT!");
23819 return splitVectorIntUnary(Op, DAG);
23820 }
23821
23822 if (Subtarget.hasInt256())
23823 return Op;
23824
23825 // Optimize vectors in AVX mode:
23826 // Sign extend v8i16 to v8i32 and
23827 // v4i32 to v4i64.
23828 //
23829 // Divide the input vector into two parts;
23830 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
23831 // Use the vpmovsx instructions to extend v4i32 -> v2i64 and v8i16 -> v4i32,
23832 // then concat the vectors back to the original VT.
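// For example (a sketch, not covering every case): for v8i16 -> v8i32,
// HalfVT is v4i32, ShufMask below becomes {4, 5, 6, 7, -1, -1, -1, -1},
// OpLo/OpHi sign-extend the low/high halves (vpmovsxwd), and the final
// CONCAT_VECTORS reassembles the v8i32 result.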
23833 MVT HalfVT = VT.getHalfNumVectorElementsVT();
23834 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
23835
23836 unsigned NumElems = InVT.getVectorNumElements();
23837 SmallVector<int,8> ShufMask(NumElems, -1);
23838 for (unsigned i = 0; i != NumElems/2; ++i)
23839 ShufMask[i] = i + NumElems/2;
23840
23841 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
23842 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
23843
23844 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
23845}
23846
23847/// Change a vector store into a pair of half-size vector stores.
23848static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
23849 SDValue StoredVal = Store->getValue();
23850 assert((StoredVal.getValueType().is256BitVector() ||
23851 StoredVal.getValueType().is512BitVector()) &&
23852 "Expecting 256/512-bit op");
23853
23854 // Splitting volatile memory ops is not allowed unless the operation was not
23855 // legal to begin with. Assume the input store is legal (this transform is
23856 // only used for targets with AVX). Note: It is possible that we have an
23857 // illegal type like v2i128, and so we could allow splitting a volatile store
23858 // in that case if that is important.
23859 if (!Store->isSimple())
23860 return SDValue();
23861
23862 SDLoc DL(Store);
23863 SDValue Value0, Value1;
23864 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
23865 unsigned HalfOffset = Value0.getValueType().getStoreSize();
23866 SDValue Ptr0 = Store->getBasePtr();
23867 SDValue Ptr1 =
23868 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
23869 SDValue Ch0 =
23870 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
23871 Store->getOriginalAlign(),
23872 Store->getMemOperand()->getFlags());
23873 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
23874 Store->getPointerInfo().getWithOffset(HalfOffset),
23875 Store->getOriginalAlign(),
23876 Store->getMemOperand()->getFlags());
23877 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
23878}
23879
23880/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
23881/// type.
23882static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
23883 SelectionDAG &DAG) {
23884 SDValue StoredVal = Store->getValue();
23885 assert(StoreVT.is128BitVector() &&
23886 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
23887 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
23888
23889 // Splitting volatile memory ops is not allowed unless the operation was not
23890 // legal to begin with. We are assuming the input op is legal (this transform
23891 // is only used for targets with AVX).
23892 if (!Store->isSimple())
23893 return SDValue();
23894
23895 MVT StoreSVT = StoreVT.getScalarType();
23896 unsigned NumElems = StoreVT.getVectorNumElements();
23897 unsigned ScalarSize = StoreSVT.getStoreSize();
23898
23899 SDLoc DL(Store);
23900 SmallVector<SDValue, 4> Stores;
23901 for (unsigned i = 0; i != NumElems; ++i) {
23902 unsigned Offset = i * ScalarSize;
23903 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
23904 TypeSize::Fixed(Offset), DL);
23905 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
23906 DAG.getIntPtrConstant(i, DL));
23907 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
23908 Store->getPointerInfo().getWithOffset(Offset),
23909 Store->getOriginalAlign(),
23910 Store->getMemOperand()->getFlags());
23911 Stores.push_back(Ch);
23912 }
23913 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
23914}
23915
23916static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
23917 SelectionDAG &DAG) {
23918 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
23919 SDLoc dl(St);
23920 SDValue StoredVal = St->getValue();
23921
23922 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
23923 if (StoredVal.getValueType().isVector() &&
23924 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
23925 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
23926 assert(NumElts <= 8 && "Unexpected VT");
23927 assert(!St->isTruncatingStore() && "Expected non-truncating store");
23928 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
23929 "Expected AVX512F without AVX512DQI");
23930
23931 // We must pad with zeros to ensure we store zeroes to any unused bits.
23932 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
23933 DAG.getUNDEF(MVT::v16i1), StoredVal,
23934 DAG.getIntPtrConstant(0, dl));
23935 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
23936 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
23937 // Make sure we store zeros in the extra bits.
23938 if (NumElts < 8)
23939 StoredVal = DAG.getZeroExtendInReg(
23940 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
23941
23942 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23943 St->getPointerInfo(), St->getOriginalAlign(),
23944 St->getMemOperand()->getFlags());
23945 }
23946
23947 if (St->isTruncatingStore())
23948 return SDValue();
23949
23950 // If this is a 256-bit store of concatenated ops, we are better off splitting
23951 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
23952 // and each half can execute independently. Some cores would split the op into
23953 // halves anyway, so the concat (vinsertf128) is purely an extra op.
23954 MVT StoreVT = StoredVal.getSimpleValueType();
23955 if (StoreVT.is256BitVector() ||
23956 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
23957 !Subtarget.hasBWI())) {
23958 SmallVector<SDValue, 4> CatOps;
23959 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
23960 return splitVectorStore(St, DAG);
23961 return SDValue();
23962 }
23963
23964 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23965 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
23966 "Unexpected VT");
23967 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
23968 TargetLowering::TypeWidenVector && "Unexpected type action!");
23969
23970 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
23971 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
23972 DAG.getUNDEF(StoreVT));
23973
23974 if (Subtarget.hasSSE2()) {
23975 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
23976 // and store it.
23977 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
23978 MVT CastVT = MVT::getVectorVT(StVT, 2);
23979 StoredVal = DAG.getBitcast(CastVT, StoredVal);
23980 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
23981 DAG.getIntPtrConstant(0, dl));
23982
23983 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23984 St->getPointerInfo(), St->getOriginalAlign(),
23985 St->getMemOperand()->getFlags());
23986 }
23987 assert(Subtarget.hasSSE1() && "Expected SSE");
23988 SDVTList Tys = DAG.getVTList(MVT::Other);
23989 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
23990 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
23991 St->getMemOperand());
23992}
23993
23994// Lower vector extended loads using a shuffle. If SSSE3 is not available we
23995// may emit an illegal shuffle but the expansion is still better than scalar
23996// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
23997 // we'll emit a shuffle and an arithmetic shift.
23998// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
23999// TODO: It is possible to support ZExt by zeroing the undef values during
24000// the shuffle phase or after the shuffle.
24001static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24002 SelectionDAG &DAG) {
24003 MVT RegVT = Op.getSimpleValueType();
24004 assert(RegVT.isVector() && "We only custom lower vector loads.");
24005 assert(RegVT.isInteger() &&
24006 "We only custom lower integer vector loads.");
24007
24008 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24009 SDLoc dl(Ld);
24010
24011 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24012 if (RegVT.getVectorElementType() == MVT::i1) {
24013 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24014 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24015 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24016 "Expected AVX512F without AVX512DQI");
24017
24018 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24019 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24020 Ld->getMemOperand()->getFlags());
24021
24022 // Replace chain users with the new chain.
24023 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24024
24025 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24026 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24027 DAG.getBitcast(MVT::v16i1, Val),
24028 DAG.getIntPtrConstant(0, dl));
24029 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24030 }
24031
24032 return SDValue();
24033}
24034
24035/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24036/// each of which has no other use apart from the AND / OR.
24037static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24038 Opc = Op.getOpcode();
24039 if (Opc != ISD::OR && Opc != ISD::AND)
24040 return false;
24041 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24042 Op.getOperand(0).hasOneUse() &&
24043 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24044 Op.getOperand(1).hasOneUse());
24045}
24046
24047SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24048 SDValue Chain = Op.getOperand(0);
24049 SDValue Cond = Op.getOperand(1);
24050 SDValue Dest = Op.getOperand(2);
24051 SDLoc dl(Op);
24052
24053 if (Cond.getOpcode() == ISD::SETCC &&
24054 Cond.getOperand(0).getValueType() != MVT::f128) {
24055 SDValue LHS = Cond.getOperand(0);
24056 SDValue RHS = Cond.getOperand(1);
24057 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24058
24059 // Special case for
24060 // setcc([su]{add,sub,mul}o == 0)
24061 // setcc([su]{add,sub,mul}o != 1)
24062 if (ISD::isOverflowIntrOpRes(LHS) &&
24063 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24064 (isNullConstant(RHS) || isOneConstant(RHS))) {
24065 SDValue Value, Overflow;
24066 X86::CondCode X86Cond;
24067 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24068
24069 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24070 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24071
24072 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24073 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24074 Overflow);
24075 }
24076
24077 if (LHS.getSimpleValueType().isInteger()) {
24078 SDValue CCVal;
24079 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24080 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24081 EFLAGS);
24082 }
24083
24084 if (CC == ISD::SETOEQ) {
24085 // For FCMP_OEQ, we can emit
24086 // two branches instead of an explicit AND instruction with a
24087 // separate test. However, we only do this if this block doesn't
24088 // have a fall-through edge, because this requires an explicit
24089 // jmp when the condition is false.
24090 if (Op.getNode()->hasOneUse()) {
24091 SDNode *User = *Op.getNode()->use_begin();
24092 // Look for an unconditional branch following this conditional branch.
24093 // We need this because we need to reverse the successors in order
24094 // to implement FCMP_OEQ.
24095 if (User->getOpcode() == ISD::BR) {
24096 SDValue FalseBB = User->getOperand(1);
24097 SDNode *NewBR =
24098 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24099 assert(NewBR == User);
24100 (void)NewBR;
24101 Dest = FalseBB;
24102
24103 SDValue Cmp =
24104 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24105 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24106 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24107 CCVal, Cmp);
24108 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24109 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24110 Cmp);
24111 }
24112 }
24113 } else if (CC == ISD::SETUNE) {
24114 // For FCMP_UNE, we can emit
24115 // two branches instead of an explicit OR instruction with a
24116 // separate test.
24117 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24118 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24119 Chain =
24120 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24121 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24122 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24123 Cmp);
24124 } else {
24125 X86::CondCode X86Cond =
24126 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24127 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24128 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24129 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24130 Cmp);
24131 }
24132 }
24133
24134 if (ISD::isOverflowIntrOpRes(Cond)) {
24135 SDValue Value, Overflow;
24136 X86::CondCode X86Cond;
24137 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24138
24139 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24140 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24141 Overflow);
24142 }
24143
24144 // Look past the truncate if the high bits are known zero.
24145 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24146 Cond = Cond.getOperand(0);
24147
24148 EVT CondVT = Cond.getValueType();
24149
24150 // Add an AND with 1 if we don't already have one.
24151 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24152 Cond =
24153 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24154
24155 SDValue LHS = Cond;
24156 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24157
24158 SDValue CCVal;
24159 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24160 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24161 EFLAGS);
24162}
24163
24164// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24165// Calls to _alloca are needed to probe the stack when allocating more than 4k
24166// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24167// that the guard pages used by the OS virtual memory manager are allocated in
24168// correct sequence.
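// For example (illustrative numbers only): a single 10000-byte allocation must
// touch the guard page at SP-4096 and again at SP-8192 before the final SP
// adjustment; skipping a 4K step would leave the guard pages untriggered and a
// later access beyond them would fault.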
24169SDValue
24170X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24171 SelectionDAG &DAG) const {
24172 MachineFunction &MF = DAG.getMachineFunction();
24173 bool SplitStack = MF.shouldSplitStack();
24174 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24175 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24176 SplitStack || EmitStackProbeCall;
24177 SDLoc dl(Op);
24178
24179 // Get the inputs.
24180 SDNode *Node = Op.getNode();
24181 SDValue Chain = Op.getOperand(0);
24182 SDValue Size = Op.getOperand(1);
24183 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24184 EVT VT = Node->getValueType(0);
24185
24186 // Chain the dynamic stack allocation so that it doesn't modify the stack
24187 // pointer when other instructions are using the stack.
24188 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24189
24190 bool Is64Bit = Subtarget.is64Bit();
24191 MVT SPTy = getPointerTy(DAG.getDataLayout());
24192
24193 SDValue Result;
24194 if (!Lower) {
24195 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24196 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24197 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24198 " not tell us which reg is the stack pointer!");
24199
24200 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24201 const Align StackAlign = TFI.getStackAlign();
24202 if (hasInlineStackProbe(MF)) {
24203 MachineRegisterInfo &MRI = MF.getRegInfo();
24204
24205 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24206 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24207 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24208 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24209 DAG.getRegister(Vreg, SPTy));
24210 } else {
24211 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24212 Chain = SP.getValue(1);
24213 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24214 }
24215 if (Alignment && *Alignment > StackAlign)
24216 Result =
24217 DAG.getNode(ISD::AND, dl, VT, Result,
24218 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24219 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24220 } else if (SplitStack) {
24221 MachineRegisterInfo &MRI = MF.getRegInfo();
24222
24223 if (Is64Bit) {
24224 // The 64-bit implementation of segmented stacks needs to clobber both r10
24225 // and r11. This makes it impossible to use it along with nested parameters.
24226 const Function &F = MF.getFunction();
24227 for (const auto &A : F.args()) {
24228 if (A.hasNestAttr())
24229 report_fatal_error("Cannot use segmented stacks with functions that "
24230 "have nested arguments.");
24231 }
24232 }
24233
24234 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24235 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24236 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24237 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24238 DAG.getRegister(Vreg, SPTy));
24239 } else {
24240 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24241 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24242 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24243
24244 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24245 Register SPReg = RegInfo->getStackRegister();
24246 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24247 Chain = SP.getValue(1);
24248
24249 if (Alignment) {
24250 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24251 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24252 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24253 }
24254
24255 Result = SP;
24256 }
24257
24258 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24259 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24260
24261 SDValue Ops[2] = {Result, Chain};
24262 return DAG.getMergeValues(Ops, dl);
24263}
24264
24265SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24266 MachineFunction &MF = DAG.getMachineFunction();
24267 auto PtrVT = getPointerTy(MF.getDataLayout());
24268 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24269
24270 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24271 SDLoc DL(Op);
24272
24273 if (!Subtarget.is64Bit() ||
24274 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24275 // vastart just stores the address of the VarArgsFrameIndex slot into the
24276 // memory location argument.
24277 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24278 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24279 MachinePointerInfo(SV));
24280 }
24281
24282 // __va_list_tag:
24283 // gp_offset (0 .. 6 * 8)
24284 // fp_offset (48 .. 48 + 8 * 16)
24285 // overflow_arg_area (points to parameters passed in memory).
24286 // reg_save_area
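// For reference, a sketch of the structure being populated below (field sizes
// follow the SysV AMD64 ABI; the exact layout is assumed, not restated by this
// code):
//   struct __va_list_tag {
//     unsigned gp_offset;       // offset 0
//     unsigned fp_offset;       // offset 4
//     void *overflow_arg_area;  // offset 8
//     void *reg_save_area;      // offset 16 on LP64 (12 on ILP32 / X32)
//   };
// which matches the stores at offsets 0, 4, 8 and 16 (or 12) that follow.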
24287 SmallVector<SDValue, 8> MemOps;
24288 SDValue FIN = Op.getOperand(1);
24289 // Store gp_offset
24290 SDValue Store = DAG.getStore(
24291 Op.getOperand(0), DL,
24292 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24293 MachinePointerInfo(SV));
24294 MemOps.push_back(Store);
24295
24296 // Store fp_offset
24297 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24298 Store = DAG.getStore(
24299 Op.getOperand(0), DL,
24300 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24301 MachinePointerInfo(SV, 4));
24302 MemOps.push_back(Store);
24303
24304 // Store ptr to overflow_arg_area
24305 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24306 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24307 Store =
24308 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24309 MemOps.push_back(Store);
24310
24311 // Store ptr to reg_save_area.
24312 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24313 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24314 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24315 Store = DAG.getStore(
24316 Op.getOperand(0), DL, RSFIN, FIN,
24317 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24318 MemOps.push_back(Store);
24319 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24320}
24321
24322SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24323 assert(Subtarget.is64Bit() &&
24324 "LowerVAARG only handles 64-bit va_arg!");
24325 assert(Op.getNumOperands() == 4);
24326
24327 MachineFunction &MF = DAG.getMachineFunction();
24328 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24329 // The Win64 ABI uses char* instead of a structure.
24330 return DAG.expandVAArg(Op.getNode());
24331
24332 SDValue Chain = Op.getOperand(0);
24333 SDValue SrcPtr = Op.getOperand(1);
24334 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24335 unsigned Align = Op.getConstantOperandVal(3);
24336 SDLoc dl(Op);
24337
24338 EVT ArgVT = Op.getNode()->getValueType(0);
24339 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24340 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24341 uint8_t ArgMode;
24342
24343 // Decide which area this value should be read from.
24344 // TODO: Implement the AMD64 ABI in its entirety. This simple
24345 // selection mechanism works only for the basic types.
24346 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24347 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24348 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24349 } else {
24350 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24351 "Unhandled argument type in LowerVAARG");
24352 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24353 }
24354
24355 if (ArgMode == 2) {
24356 // Sanity Check: Make sure using fp_offset makes sense.
24357 assert(!Subtarget.useSoftFloat() &&
24358 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24359 Subtarget.hasSSE1());
24360 }
24361
24362 // Insert VAARG_64 node into the DAG
24363 // VAARG_64 returns two values: Variable Argument Address, Chain
24364 SDValue InstOps[] = {Chain, SrcPtr,
24365 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24366 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24367 DAG.getTargetConstant(Align, dl, MVT::i32)};
24368 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24369 SDValue VAARG = DAG.getMemIntrinsicNode(
24370 X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24371 /*Alignment=*/None,
24372 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24373 Chain = VAARG.getValue(1);
24374
24375 // Load the next argument and return it
24376 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24377}
24378
24379static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24380 SelectionDAG &DAG) {
24381 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24382 // where a va_list is still an i8*.
24383 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24384 if (Subtarget.isCallingConvWin64(
24385 DAG.getMachineFunction().getFunction().getCallingConv()))
24386 // Probably a Win64 va_copy.
24387 return DAG.expandVACopy(Op.getNode());
24388
24389 SDValue Chain = Op.getOperand(0);
24390 SDValue DstPtr = Op.getOperand(1);
24391 SDValue SrcPtr = Op.getOperand(2);
24392 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24393 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24394 SDLoc DL(Op);
24395
24396 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
24397 Align(8), /*isVolatile*/ false, false, false,
24398 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24399}
24400
24401// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24402static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24403 switch (Opc) {
24404 case ISD::SHL:
24405 case X86ISD::VSHL:
24406 case X86ISD::VSHLI:
24407 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24408 case ISD::SRL:
24409 case X86ISD::VSRL:
24410 case X86ISD::VSRLI:
24411 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24412 case ISD::SRA:
24413 case X86ISD::VSRA:
24414 case X86ISD::VSRAI:
24415 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24416 }
24417 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24417)
;
24418}
24419
24420/// Handle vector element shifts where the shift amount is a constant.
24421/// Takes immediate version of shift as input.
24422static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24423 SDValue SrcOp, uint64_t ShiftAmt,
24424 SelectionDAG &DAG) {
24425 MVT ElementType = VT.getVectorElementType();
24426
24427 // Bitcast the source vector to the output type; this is mainly necessary for
24428 // vXi8/vXi64 shifts.
24429 if (VT != SrcOp.getSimpleValueType())
24430 SrcOp = DAG.getBitcast(VT, SrcOp);
24431
24432 // Fold this packed shift into its first operand if ShiftAmt is 0.
24433 if (ShiftAmt == 0)
24434 return SrcOp;
24435
24436 // Check for ShiftAmt >= element width
24437 if (ShiftAmt >= ElementType.getSizeInBits()) {
24438 if (Opc == X86ISD::VSRAI)
24439 ShiftAmt = ElementType.getSizeInBits() - 1;
24440 else
24441 return DAG.getConstant(0, dl, VT);
24442 }
24443
24444 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24445 && "Unknown target vector shift-by-constant node");
24446
24447 // Fold this packed vector shift into a build vector if SrcOp is a
24448 // vector of Constants or UNDEFs.
24449 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24450 SmallVector<SDValue, 8> Elts;
24451 unsigned NumElts = SrcOp->getNumOperands();
24452
24453 switch (Opc) {
24454 default: llvm_unreachable("Unknown opcode!");
24455 case X86ISD::VSHLI:
24456 for (unsigned i = 0; i != NumElts; ++i) {
24457 SDValue CurrentOp = SrcOp->getOperand(i);
24458 if (CurrentOp->isUndef()) {
24459 // Must produce 0s in the correct bits.
24460 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24461 continue;
24462 }
24463 auto *ND = cast<ConstantSDNode>(CurrentOp);
24464 const APInt &C = ND->getAPIntValue();
24465 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24466 }
24467 break;
24468 case X86ISD::VSRLI:
24469 for (unsigned i = 0; i != NumElts; ++i) {
24470 SDValue CurrentOp = SrcOp->getOperand(i);
24471 if (CurrentOp->isUndef()) {
24472 // Must produce 0s in the correct bits.
24473 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24474 continue;
24475 }
24476 auto *ND = cast<ConstantSDNode>(CurrentOp);
24477 const APInt &C = ND->getAPIntValue();
24478 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24479 }
24480 break;
24481 case X86ISD::VSRAI:
24482 for (unsigned i = 0; i != NumElts; ++i) {
24483 SDValue CurrentOp = SrcOp->getOperand(i);
24484 if (CurrentOp->isUndef()) {
24485 // All shifted in bits must be the same so use 0.
24486 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24487 continue;
24488 }
24489 auto *ND = cast<ConstantSDNode>(CurrentOp);
24490 const APInt &C = ND->getAPIntValue();
24491 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24492 }
24493 break;
24494 }
24495
24496 return DAG.getBuildVector(VT, dl, Elts);
24497 }
24498
24499 return DAG.getNode(Opc, dl, VT, SrcOp,
24500 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24501}
24502
24503/// Handle vector element shifts where the shift amount may or may not be a
24504/// constant. Takes immediate version of shift as input.
24505static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24506 SDValue SrcOp, SDValue ShAmt,
24507 const X86Subtarget &Subtarget,
24508 SelectionDAG &DAG) {
24509 MVT SVT = ShAmt.getSimpleValueType();
24510 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24511
24512 // Catch shift-by-constant.
24513 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24514 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24515 CShAmt->getZExtValue(), DAG);
24516
24517 // Change opcode to non-immediate version.
24518 Opc = getTargetVShiftUniformOpcode(Opc, true);
24519
24520 // Need to build a vector containing shift amount.
24521 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
24522 // +====================+============+=======================================+
24523 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
24524 // +====================+============+=======================================+
24525 // | i64 | Yes, No | Use ShAmt as lowest elt |
24526 // | i32 | Yes | zero-extend in-reg |
24527 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
24528 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
24529 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24530 // +====================+============+=======================================+
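// For example (a sketch of the no-SSE4.1 byte-shift row above): for an i16
// shift amount extracted from a vector, AmtTy is v8i16 and ByteShift is
// (128 - 16) / 8 = 14, so shifting the whole register left and then right by
// 14 bytes leaves the amount zero-extended in the low 64 bits.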
24531
24532 if (SVT == MVT::i64)
24533 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
24534 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
24535 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24536 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
24537 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
24538 ShAmt = ShAmt.getOperand(0);
24539 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
24540 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
24541 if (Subtarget.hasSSE41())
24542 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24543 MVT::v2i64, ShAmt);
24544 else {
24545 SDValue ByteShift = DAG.getTargetConstant(
24546 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
24547 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
24548 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24549 ByteShift);
24550 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24551 ByteShift);
24552 }
24553 } else if (Subtarget.hasSSE41() &&
24554 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24555 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
24556 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24557 MVT::v2i64, ShAmt);
24558 } else {
24559 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
24560 DAG.getUNDEF(SVT)};
24561 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
24562 }
24563
24564 // The return type has to be a 128-bit type with the same element
24565 // type as the input type.
24566 MVT EltVT = VT.getVectorElementType();
24567 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
24568
24569 ShAmt = DAG.getBitcast(ShVT, ShAmt);
24570 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
24571}
24572
24573/// Return Mask with the necessary casting or extending
24574/// for \p Mask according to \p MaskVT when lowering masking intrinsics
24575static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
24576 const X86Subtarget &Subtarget, SelectionDAG &DAG,
24577 const SDLoc &dl) {
24578
24579 if (isAllOnesConstant(Mask))
24580 return DAG.getConstant(1, dl, MaskVT);
24581 if (X86::isZeroNode(Mask))
24582 return DAG.getConstant(0, dl, MaskVT);
24583
24584 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
24585
24586 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
24587 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
24588 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
24589 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it instead.
24590 SDValue Lo, Hi;
24591 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24592 DAG.getConstant(0, dl, MVT::i32));
24593 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
24594 DAG.getConstant(1, dl, MVT::i32));
24595
24596 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24597 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24598
24599 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24600 } else {
24601 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
24602 Mask.getSimpleValueType().getSizeInBits());
24603 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
24604 // are extracted by EXTRACT_SUBVECTOR.
24605 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
24606 DAG.getBitcast(BitcastVT, Mask),
24607 DAG.getIntPtrConstant(0, dl));
24608 }
24609}
24610
24611/// Return (and \p Op, \p Mask) for compare instructions or
24612/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
24613/// necessary casting or extending for \p Mask when lowering masking intrinsics
24614static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
24615 SDValue PreservedSrc,
24616 const X86Subtarget &Subtarget,
24617 SelectionDAG &DAG) {
24618 MVT VT = Op.getSimpleValueType();
24619 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
24620 unsigned OpcodeSelect = ISD::VSELECT;
24621 SDLoc dl(Op);
24622
24623 if (isAllOnesConstant(Mask))
24624 return Op;
24625
24626 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24627
24628 if (PreservedSrc.isUndef())
24629 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24630 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
24631}
24632
24633/// Creates an SDNode for a predicated scalar operation.
24634/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
24635 /// The mask comes in as MVT::i8 and should be transformed
24636 /// to MVT::v1i1 while lowering masking intrinsics.
24637/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
24638/// "X86select" instead of "vselect". We just can't create the "vselect" node
24639/// for a scalar instruction.
24640static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
24641 SDValue PreservedSrc,
24642 const X86Subtarget &Subtarget,
24643 SelectionDAG &DAG) {
24644
24645 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
24646 if (MaskConst->getZExtValue() & 0x1)
24647 return Op;
24648
24649 MVT VT = Op.getSimpleValueType();
24650 SDLoc dl(Op);
24651
24652  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
24653 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
24654 DAG.getBitcast(MVT::v8i1, Mask),
24655 DAG.getIntPtrConstant(0, dl));
24656 if (Op.getOpcode() == X86ISD::FSETCCM ||
24657 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
24658 Op.getOpcode() == X86ISD::VFPCLASSS)
24659 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
24660
24661 if (PreservedSrc.isUndef())
24662 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
24663 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
24664}
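As the early-exit at the top of getScalarMaskingNode suggests, only bit 0 of the incoming i8 mask participates in a scalar masking operation. A minimal value-level sketch (plain C++, illustrative names and values):

// Value-level model of scalar masking: only bit 0 of the i8 mask is consulted.
#include <cstdint>
#include <cstdio>

static double scalarSelect(uint8_t Mask, double Op, double PreservedSrc) {
  return (Mask & 0x1) ? Op : PreservedSrc; // upper mask bits are ignored
}

int main() {
  std::printf("%g\n", scalarSelect(0x01, 2.5, 7.0)); // 2.5 (bit 0 set)
  std::printf("%g\n", scalarSelect(0xFE, 2.5, 7.0)); // 7   (bit 0 clear; other bits ignored)
}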
24665
24666static int getSEHRegistrationNodeSize(const Function *Fn) {
24667 if (!Fn->hasPersonalityFn())
24668 report_fatal_error(
24669 "querying registration node size for function without personality");
24670 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
24671 // WinEHStatePass for the full struct definition.
24672 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
24673 case EHPersonality::MSVC_X86SEH: return 24;
24674 case EHPersonality::MSVC_CXX: return 16;
24675 default: break;
24676 }
24677 report_fatal_error(
24678 "can only recover FP for 32-bit MSVC EH personality functions");
24679}
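The sizes returned above are simply the word counts from the comment times four bytes; a tiny compile-time check under that assumption (standalone, not part of the lowering code):

#include <cstdint>

// Sanity check: 6 and 4 32-bit words yield the 24- and 16-byte sizes returned above.
static_assert(6 * sizeof(std::uint32_t) == 24, "MSVC x86 SEH registration node size");
static_assert(4 * sizeof(std::uint32_t) == 16, "MSVC C++ EH registration node size");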
24680
24681/// When the MSVC runtime transfers control to us, either to an outlined
24682/// function or when returning to a parent frame after catching an exception, we
24683/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
24684/// Here's the math:
24685/// RegNodeBase = EntryEBP - RegNodeSize
24686/// ParentFP = RegNodeBase - ParentFrameOffset
24687/// Subtracting RegNodeSize takes us to the offset of the registration node, and
24688/// subtracting the offset (negative on x86) takes us back to the parent FP.
24689static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
24690 SDValue EntryEBP) {
24691 MachineFunction &MF = DAG.getMachineFunction();
24692 SDLoc dl;
24693
24694 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24695 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24696
24697 // It's possible that the parent function no longer has a personality function
24698 // if the exceptional code was optimized away, in which case we just return
24699 // the incoming EBP.
24700 if (!Fn->hasPersonalityFn())
24701 return EntryEBP;
24702
24703 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
24704 // registration, or the .set_setframe offset.
24705 MCSymbol *OffsetSym =
24706 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
24707 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
24708 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
24709 SDValue ParentFrameOffset =
24710 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
24711
24712 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
24713 // prologue to RBP in the parent function.
24714 const X86Subtarget &Subtarget =
24715 static_cast<const X86Subtarget &>(DAG.getSubtarget());
24716 if (Subtarget.is64Bit())
24717 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
24718
24719 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
24720 // RegNodeBase = EntryEBP - RegNodeSize
24721 // ParentFP = RegNodeBase - ParentFrameOffset
24722 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
24723 DAG.getConstant(RegNodeSize, dl, PtrVT));
24724 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
24725}
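On 32-bit targets the recovery above is two subtractions. A small worked example of that arithmetic, with the EntryEBP and ParentFrameOffset values invented purely for illustration:

// Worked example of the 32-bit recovery math:
//   RegNodeBase = EntryEBP - RegNodeSize
//   ParentFP    = RegNodeBase - ParentFrameOffset
#include <cstdint>
#include <cstdio>

int main() {
  std::uint32_t EntryEBP = 0x0019FF70;    // hypothetical incoming EBP
  std::uint32_t RegNodeSize = 24;         // MSVC x86 SEH personality (see above)
  std::int32_t ParentFrameOffset = -0x40; // negative on x86, per the comment

  std::uint32_t RegNodeBase = EntryEBP - RegNodeSize;
  std::uint32_t ParentFP = RegNodeBase - ParentFrameOffset; // subtracting a negative adds

  std::printf("RegNodeBase = 0x%08X\n", (unsigned)RegNodeBase); // 0x0019FF58
  std::printf("ParentFP    = 0x%08X\n", (unsigned)ParentFP);    // 0x0019FF98
}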
24726
24727SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
24728 SelectionDAG &DAG) const {
24729 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
24730 auto isRoundModeCurDirection = [](SDValue Rnd) {
24731 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
24732 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
24733
24734 return false;
24735 };
24736 auto isRoundModeSAE = [](SDValue Rnd) {
24737 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
24738 unsigned RC = C->getZExtValue();
24739 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
24740 // Clear the NO_EXC bit and check remaining bits.
24741 RC ^= X86::STATIC_ROUNDING::NO_EXC;
24742        // As a convenience we allow either no other bits set or an explicit
24743        // current-direction rounding mode.
24744 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
24745 }
24746 }
24747
24748 return false;
24749 };
24750 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
24751 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
24752 RC = C->getZExtValue();
24753 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
24754 // Clear the NO_EXC bit and check remaining bits.
24755 RC ^= X86::STATIC_ROUNDING::NO_EXC;
24756 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
24757 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
24758 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
24759 RC == X86::STATIC_ROUNDING::TO_ZERO;
24760 }
24761 }
24762
24763 return false;
24764 };
24765
24766 SDLoc dl(Op);
24767 unsigned IntNo = Op.getConstantOperandVal(0);
24768 MVT VT = Op.getSimpleValueType();
24769 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
24770
24771 if (IntrData) {
24772 switch(IntrData->Type) {
24773 case INTR_TYPE_1OP: {
24774 // We specify 2 possible opcodes for intrinsics with rounding modes.
24775 // First, we check if the intrinsic may have non-default rounding mode,
24776 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
24777 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24778 if (IntrWithRoundingModeOpcode != 0) {
24779 SDValue Rnd = Op.getOperand(2);
24780 unsigned RC = 0;
24781 if (isRoundModeSAEToX(Rnd, RC))
24782 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
24783 Op.getOperand(1),
24784 DAG.getTargetConstant(RC, dl, MVT::i32));
24785 if (!isRoundModeCurDirection(Rnd))
24786 return SDValue();
24787 }
24788 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24789 Op.getOperand(1));
24790 }
24791 case INTR_TYPE_1OP_SAE: {
24792 SDValue Sae = Op.getOperand(2);
24793
24794 unsigned Opc;
24795 if (isRoundModeCurDirection(Sae))
24796 Opc = IntrData->Opc0;
24797 else if (isRoundModeSAE(Sae))
24798 Opc = IntrData->Opc1;
24799 else
24800 return SDValue();
24801
24802 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
24803 }
24804 case INTR_TYPE_2OP: {
24805 SDValue Src2 = Op.getOperand(2);
24806
24807 // We specify 2 possible opcodes for intrinsics with rounding modes.
24808 // First, we check if the intrinsic may have non-default rounding mode,
24809 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
24810 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24811 if (IntrWithRoundingModeOpcode != 0) {
24812 SDValue Rnd = Op.getOperand(3);
24813 unsigned RC = 0;
24814 if (isRoundModeSAEToX(Rnd, RC))
24815 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
24816 Op.getOperand(1), Src2,
24817 DAG.getTargetConstant(RC, dl, MVT::i32));
24818 if (!isRoundModeCurDirection(Rnd))
24819 return SDValue();
24820 }
24821
24822 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24823 Op.getOperand(1), Src2);
24824 }
24825 case INTR_TYPE_2OP_SAE: {
24826 SDValue Sae = Op.getOperand(3);
24827
24828 unsigned Opc;
24829 if (isRoundModeCurDirection(Sae))
24830 Opc = IntrData->Opc0;
24831 else if (isRoundModeSAE(Sae))
24832 Opc = IntrData->Opc1;
24833 else
24834 return SDValue();
24835
24836 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
24837 Op.getOperand(2));
24838 }
24839 case INTR_TYPE_3OP:
24840 case INTR_TYPE_3OP_IMM8: {
24841 SDValue Src1 = Op.getOperand(1);
24842 SDValue Src2 = Op.getOperand(2);
24843 SDValue Src3 = Op.getOperand(3);
24844
24845 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
24846 Src3.getValueType() != MVT::i8) {
24847 Src3 = DAG.getTargetConstant(
24848 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
24849 }
24850
24851 // We specify 2 possible opcodes for intrinsics with rounding modes.
24852 // First, we check if the intrinsic may have non-default rounding mode,
24853 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
24854 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24855 if (IntrWithRoundingModeOpcode != 0) {
24856 SDValue Rnd = Op.getOperand(4);
24857 unsigned RC = 0;
24858 if (isRoundModeSAEToX(Rnd, RC))
24859 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
24860 Src1, Src2, Src3,
24861 DAG.getTargetConstant(RC, dl, MVT::i32));
24862 if (!isRoundModeCurDirection(Rnd))
24863 return SDValue();
24864 }
24865
24866 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24867 {Src1, Src2, Src3});
24868 }
24869 case INTR_TYPE_4OP_IMM8: {
24870      assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
24871 SDValue Src4 = Op.getOperand(4);
24872 if (Src4.getValueType() != MVT::i8) {
24873 Src4 = DAG.getTargetConstant(
24874 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
24875 }
24876
24877 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24878 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
24879 Src4);
24880 }
24881 case INTR_TYPE_1OP_MASK: {
24882 SDValue Src = Op.getOperand(1);
24883 SDValue PassThru = Op.getOperand(2);
24884 SDValue Mask = Op.getOperand(3);
24885 // We add rounding mode to the Node when
24886 // - RC Opcode is specified and
24887 // - RC is not "current direction".
24888 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24889 if (IntrWithRoundingModeOpcode != 0) {
24890 SDValue Rnd = Op.getOperand(4);
24891 unsigned RC = 0;
24892 if (isRoundModeSAEToX(Rnd, RC))
24893 return getVectorMaskingNode(
24894 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
24895 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
24896 Mask, PassThru, Subtarget, DAG);
24897 if (!isRoundModeCurDirection(Rnd))
24898 return SDValue();
24899 }
24900 return getVectorMaskingNode(
24901 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
24902 Subtarget, DAG);
24903 }
24904 case INTR_TYPE_1OP_MASK_SAE: {
24905 SDValue Src = Op.getOperand(1);
24906 SDValue PassThru = Op.getOperand(2);
24907 SDValue Mask = Op.getOperand(3);
24908 SDValue Rnd = Op.getOperand(4);
24909
24910 unsigned Opc;
24911 if (isRoundModeCurDirection(Rnd))
24912 Opc = IntrData->Opc0;
24913 else if (isRoundModeSAE(Rnd))
24914 Opc = IntrData->Opc1;
24915 else
24916 return SDValue();
24917
24918 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
24919 Subtarget, DAG);
24920 }
24921 case INTR_TYPE_SCALAR_MASK: {
24922 SDValue Src1 = Op.getOperand(1);
24923 SDValue Src2 = Op.getOperand(2);
24924 SDValue passThru = Op.getOperand(3);
24925 SDValue Mask = Op.getOperand(4);
24926 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24927 // There are 2 kinds of intrinsics in this group:
24928      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
24929 // (2) With rounding mode and sae - 7 operands.
24930 bool HasRounding = IntrWithRoundingModeOpcode != 0;
24931 if (Op.getNumOperands() == (5U + HasRounding)) {
24932 if (HasRounding) {
24933 SDValue Rnd = Op.getOperand(5);
24934 unsigned RC = 0;
24935 if (isRoundModeSAEToX(Rnd, RC))
24936 return getScalarMaskingNode(
24937 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
24938 DAG.getTargetConstant(RC, dl, MVT::i32)),
24939 Mask, passThru, Subtarget, DAG);
24940 if (!isRoundModeCurDirection(Rnd))
24941 return SDValue();
24942 }
24943 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
24944 Src2),
24945 Mask, passThru, Subtarget, DAG);
24946 }
24947
24948      assert(Op.getNumOperands() == (6U + HasRounding) &&
24949             "Unexpected intrinsic form");
24950 SDValue RoundingMode = Op.getOperand(5);
24951 unsigned Opc = IntrData->Opc0;
24952 if (HasRounding) {
24953 SDValue Sae = Op.getOperand(6);
24954 if (isRoundModeSAE(Sae))
24955 Opc = IntrWithRoundingModeOpcode;
24956 else if (!isRoundModeCurDirection(Sae))
24957 return SDValue();
24958 }
24959 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
24960 Src2, RoundingMode),
24961 Mask, passThru, Subtarget, DAG);
24962 }
24963 case INTR_TYPE_SCALAR_MASK_RND: {
24964 SDValue Src1 = Op.getOperand(1);
24965 SDValue Src2 = Op.getOperand(2);
24966 SDValue passThru = Op.getOperand(3);
24967 SDValue Mask = Op.getOperand(4);
24968 SDValue Rnd = Op.getOperand(5);
24969
24970 SDValue NewOp;
24971 unsigned RC = 0;
24972 if (isRoundModeCurDirection(Rnd))
24973 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
24974 else if (isRoundModeSAEToX(Rnd, RC))
24975 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
24976 DAG.getTargetConstant(RC, dl, MVT::i32));
24977 else
24978 return SDValue();
24979
24980 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
24981 }
24982 case INTR_TYPE_SCALAR_MASK_SAE: {
24983 SDValue Src1 = Op.getOperand(1);
24984 SDValue Src2 = Op.getOperand(2);
24985 SDValue passThru = Op.getOperand(3);
24986 SDValue Mask = Op.getOperand(4);
24987 SDValue Sae = Op.getOperand(5);
24988 unsigned Opc;
24989 if (isRoundModeCurDirection(Sae))
24990 Opc = IntrData->Opc0;
24991 else if (isRoundModeSAE(Sae))
24992 Opc = IntrData->Opc1;
24993 else
24994 return SDValue();
24995
24996 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
24997 Mask, passThru, Subtarget, DAG);
24998 }
24999 case INTR_TYPE_2OP_MASK: {
25000 SDValue Src1 = Op.getOperand(1);
25001 SDValue Src2 = Op.getOperand(2);
25002 SDValue PassThru = Op.getOperand(3);
25003 SDValue Mask = Op.getOperand(4);
25004 SDValue NewOp;
25005 if (IntrData->Opc1 != 0) {
25006 SDValue Rnd = Op.getOperand(5);
25007 unsigned RC = 0;
25008 if (isRoundModeSAEToX(Rnd, RC))
25009 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25010 DAG.getTargetConstant(RC, dl, MVT::i32));
25011 else if (!isRoundModeCurDirection(Rnd))
25012 return SDValue();
25013 }
25014 if (!NewOp)
25015 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25016 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25017 }
25018 case INTR_TYPE_2OP_MASK_SAE: {
25019 SDValue Src1 = Op.getOperand(1);
25020 SDValue Src2 = Op.getOperand(2);
25021 SDValue PassThru = Op.getOperand(3);
25022 SDValue Mask = Op.getOperand(4);
25023
25024 unsigned Opc = IntrData->Opc0;
25025 if (IntrData->Opc1 != 0) {
25026 SDValue Sae = Op.getOperand(5);
25027 if (isRoundModeSAE(Sae))
25028 Opc = IntrData->Opc1;
25029 else if (!isRoundModeCurDirection(Sae))
25030 return SDValue();
25031 }
25032
25033 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25034 Mask, PassThru, Subtarget, DAG);
25035 }
25036 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25037 SDValue Src1 = Op.getOperand(1);
25038 SDValue Src2 = Op.getOperand(2);
25039 SDValue Src3 = Op.getOperand(3);
25040 SDValue PassThru = Op.getOperand(4);
25041 SDValue Mask = Op.getOperand(5);
25042 SDValue Sae = Op.getOperand(6);
25043 unsigned Opc;
25044 if (isRoundModeCurDirection(Sae))
25045 Opc = IntrData->Opc0;
25046 else if (isRoundModeSAE(Sae))
25047 Opc = IntrData->Opc1;
25048 else
25049 return SDValue();
25050
25051 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25052 Mask, PassThru, Subtarget, DAG);
25053 }
25054 case INTR_TYPE_3OP_MASK_SAE: {
25055 SDValue Src1 = Op.getOperand(1);
25056 SDValue Src2 = Op.getOperand(2);
25057 SDValue Src3 = Op.getOperand(3);
25058 SDValue PassThru = Op.getOperand(4);
25059 SDValue Mask = Op.getOperand(5);
25060
25061 unsigned Opc = IntrData->Opc0;
25062 if (IntrData->Opc1 != 0) {
25063 SDValue Sae = Op.getOperand(6);
25064 if (isRoundModeSAE(Sae))
25065 Opc = IntrData->Opc1;
25066 else if (!isRoundModeCurDirection(Sae))
25067 return SDValue();
25068 }
25069 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25070 Mask, PassThru, Subtarget, DAG);
25071 }
25072 case BLENDV: {
25073 SDValue Src1 = Op.getOperand(1);
25074 SDValue Src2 = Op.getOperand(2);
25075 SDValue Src3 = Op.getOperand(3);
25076
25077 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25078 Src3 = DAG.getBitcast(MaskVT, Src3);
25079
25080 // Reverse the operands to match VSELECT order.
25081 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25082 }
25083 case VPERM_2OP : {
25084 SDValue Src1 = Op.getOperand(1);
25085 SDValue Src2 = Op.getOperand(2);
25086
25087 // Swap Src1 and Src2 in the node creation
25088 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25089 }
25090 case IFMA_OP:
25091 // NOTE: We need to swizzle the operands to pass the multiply operands
25092 // first.
25093 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25094 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25095 case FPCLASSS: {
25096 SDValue Src1 = Op.getOperand(1);
25097 SDValue Imm = Op.getOperand(2);
25098 SDValue Mask = Op.getOperand(3);
25099 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25100 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25101 Subtarget, DAG);
25102 // Need to fill with zeros to ensure the bitcast will produce zeroes
25103 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25104 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25105 DAG.getConstant(0, dl, MVT::v8i1),
25106 FPclassMask, DAG.getIntPtrConstant(0, dl));
25107 return DAG.getBitcast(MVT::i8, Ins);
25108 }
25109
25110 case CMP_MASK_CC: {
25111 MVT MaskVT = Op.getSimpleValueType();
25112 SDValue CC = Op.getOperand(3);
25113 SDValue Mask = Op.getOperand(4);
25114 // We specify 2 possible opcodes for intrinsics with rounding modes.
25115 // First, we check if the intrinsic may have non-default rounding mode,
25116 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25117 if (IntrData->Opc1 != 0) {
25118 SDValue Sae = Op.getOperand(5);
25119 if (isRoundModeSAE(Sae))
25120 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25121 Op.getOperand(2), CC, Mask, Sae);
25122 if (!isRoundModeCurDirection(Sae))
25123 return SDValue();
25124 }
25125      // Default rounding mode.
25126 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25127 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25128 }
25129 case CMP_MASK_SCALAR_CC: {
25130 SDValue Src1 = Op.getOperand(1);
25131 SDValue Src2 = Op.getOperand(2);
25132 SDValue CC = Op.getOperand(3);
25133 SDValue Mask = Op.getOperand(4);
25134
25135 SDValue Cmp;
25136 if (IntrData->Opc1 != 0) {
25137 SDValue Sae = Op.getOperand(5);
25138 if (isRoundModeSAE(Sae))
25139 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25140 else if (!isRoundModeCurDirection(Sae))
25141 return SDValue();
25142 }
25143      // Default rounding mode.
25144 if (!Cmp.getNode())
25145 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25146
25147 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25148 Subtarget, DAG);
25149 // Need to fill with zeros to ensure the bitcast will produce zeroes
25150 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25151 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25152 DAG.getConstant(0, dl, MVT::v8i1),
25153 CmpMask, DAG.getIntPtrConstant(0, dl));
25154 return DAG.getBitcast(MVT::i8, Ins);
25155 }
25156 case COMI: { // Comparison intrinsics
25157 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25158 SDValue LHS = Op.getOperand(1);
25159 SDValue RHS = Op.getOperand(2);
25160 // Some conditions require the operands to be swapped.
25161 if (CC == ISD::SETLT || CC == ISD::SETLE)
25162 std::swap(LHS, RHS);
25163
25164 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25165 SDValue SetCC;
25166 switch (CC) {
25167 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25168 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25169 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25170 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25171 break;
25172 }
25173 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25174 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25175 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25176 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25177 break;
25178 }
25179 case ISD::SETGT: // (CF = 0 and ZF = 0)
25180 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25181 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25182 break;
25183 }
25184 case ISD::SETGE: // CF = 0
25185 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25186 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25187 break;
25188 default:
25189        llvm_unreachable("Unexpected illegal condition!");
25190 }
25191 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25192 }
25193 case COMI_RM: { // Comparison intrinsics with Sae
25194 SDValue LHS = Op.getOperand(1);
25195 SDValue RHS = Op.getOperand(2);
25196 unsigned CondVal = Op.getConstantOperandVal(3);
25197 SDValue Sae = Op.getOperand(4);
25198
25199 SDValue FCmp;
25200 if (isRoundModeCurDirection(Sae))
25201 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25202 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25203 else if (isRoundModeSAE(Sae))
25204 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25205 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25206 else
25207 return SDValue();
25208 // Need to fill with zeros to ensure the bitcast will produce zeroes
25209 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25210 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25211 DAG.getConstant(0, dl, MVT::v16i1),
25212 FCmp, DAG.getIntPtrConstant(0, dl));
25213 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25214 DAG.getBitcast(MVT::i16, Ins));
25215 }
25216 case VSHIFT:
25217 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25218 Op.getOperand(1), Op.getOperand(2), Subtarget,
25219 DAG);
25220 case COMPRESS_EXPAND_IN_REG: {
25221 SDValue Mask = Op.getOperand(3);
25222 SDValue DataToCompress = Op.getOperand(1);
25223 SDValue PassThru = Op.getOperand(2);
25224 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25225 return Op.getOperand(1);
25226
25227 // Avoid false dependency.
25228 if (PassThru.isUndef())
25229 PassThru = DAG.getConstant(0, dl, VT);
25230
25231 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25232 Mask);
25233 }
25234 case FIXUPIMM:
25235 case FIXUPIMM_MASKZ: {
25236 SDValue Src1 = Op.getOperand(1);
25237 SDValue Src2 = Op.getOperand(2);
25238 SDValue Src3 = Op.getOperand(3);
25239 SDValue Imm = Op.getOperand(4);
25240 SDValue Mask = Op.getOperand(5);
25241 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25242 ? Src1
25243 : getZeroVector(VT, Subtarget, DAG, dl);
25244
25245 unsigned Opc = IntrData->Opc0;
25246 if (IntrData->Opc1 != 0) {
25247 SDValue Sae = Op.getOperand(6);
25248 if (isRoundModeSAE(Sae))
25249 Opc = IntrData->Opc1;
25250 else if (!isRoundModeCurDirection(Sae))
25251 return SDValue();
25252 }
25253
25254 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25255
25256 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25257 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25258
25259 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25260 }
25261 case ROUNDP: {
25262      assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25263 // Clear the upper bits of the rounding immediate so that the legacy
25264 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25265 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25266 SDValue RoundingMode =
25267 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25268 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25269 Op.getOperand(1), RoundingMode);
25270 }
25271 case ROUNDS: {
25272      assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25273 // Clear the upper bits of the rounding immediate so that the legacy
25274 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25275 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25276 SDValue RoundingMode =
25277 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25278 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25279 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25280 }
25281 case BEXTRI: {
25282      assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25283
25284 uint64_t Imm = Op.getConstantOperandVal(2);
25285 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25286 Op.getValueType());
25287 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25288 Op.getOperand(1), Control);
25289 }
25290 // ADC/ADCX/SBB
25291 case ADX: {
25292 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25293 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25294
25295 SDValue Res;
25296 // If the carry in is zero, then we should just use ADD/SUB instead of
25297 // ADC/SBB.
25298 if (isNullConstant(Op.getOperand(1))) {
25299 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25300 Op.getOperand(3));
25301 } else {
25302 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25303 DAG.getConstant(-1, dl, MVT::i8));
25304 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25305 Op.getOperand(3), GenCF.getValue(1));
25306 }
25307 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25308 SDValue Results[] = { SetCC, Res };
25309 return DAG.getMergeValues(Results, dl);
25310 }
25311 case CVTPD2PS_MASK:
25312 case CVTPD2DQ_MASK:
25313 case CVTQQ2PS_MASK:
25314 case TRUNCATE_TO_REG: {
25315 SDValue Src = Op.getOperand(1);
25316 SDValue PassThru = Op.getOperand(2);
25317 SDValue Mask = Op.getOperand(3);
25318
25319 if (isAllOnesConstant(Mask))
25320 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25321
25322 MVT SrcVT = Src.getSimpleValueType();
25323 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25324 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25325 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25326 {Src, PassThru, Mask});
25327 }
25328 case CVTPS2PH_MASK: {
25329 SDValue Src = Op.getOperand(1);
25330 SDValue Rnd = Op.getOperand(2);
25331 SDValue PassThru = Op.getOperand(3);
25332 SDValue Mask = Op.getOperand(4);
25333
25334 if (isAllOnesConstant(Mask))
25335 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25336
25337 MVT SrcVT = Src.getSimpleValueType();
25338 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25339 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25340 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25341 PassThru, Mask);
25342
25343 }
25344 case CVTNEPS2BF16_MASK: {
25345 SDValue Src = Op.getOperand(1);
25346 SDValue PassThru = Op.getOperand(2);
25347 SDValue Mask = Op.getOperand(3);
25348
25349 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25350 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25351
25352 // Break false dependency.
25353 if (PassThru.isUndef())
25354 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25355
25356 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25357 Mask);
25358 }
25359 default:
25360 break;
25361 }
25362 }
25363
25364 switch (IntNo) {
25365 default: return SDValue(); // Don't custom lower most intrinsics.
25366
25367  // ptest and testp intrinsics. The intrinsics these come from are designed to
25368  // return an integer value, not just an instruction, so lower them to the ptest
25369  // or testp pattern plus a setcc for the result.
25370 case Intrinsic::x86_avx512_ktestc_b:
25371 case Intrinsic::x86_avx512_ktestc_w:
25372 case Intrinsic::x86_avx512_ktestc_d:
25373 case Intrinsic::x86_avx512_ktestc_q:
25374 case Intrinsic::x86_avx512_ktestz_b:
25375 case Intrinsic::x86_avx512_ktestz_w:
25376 case Intrinsic::x86_avx512_ktestz_d:
25377 case Intrinsic::x86_avx512_ktestz_q:
25378 case Intrinsic::x86_sse41_ptestz:
25379 case Intrinsic::x86_sse41_ptestc:
25380 case Intrinsic::x86_sse41_ptestnzc:
25381 case Intrinsic::x86_avx_ptestz_256:
25382 case Intrinsic::x86_avx_ptestc_256:
25383 case Intrinsic::x86_avx_ptestnzc_256:
25384 case Intrinsic::x86_avx_vtestz_ps:
25385 case Intrinsic::x86_avx_vtestc_ps:
25386 case Intrinsic::x86_avx_vtestnzc_ps:
25387 case Intrinsic::x86_avx_vtestz_pd:
25388 case Intrinsic::x86_avx_vtestc_pd:
25389 case Intrinsic::x86_avx_vtestnzc_pd:
25390 case Intrinsic::x86_avx_vtestz_ps_256:
25391 case Intrinsic::x86_avx_vtestc_ps_256:
25392 case Intrinsic::x86_avx_vtestnzc_ps_256:
25393 case Intrinsic::x86_avx_vtestz_pd_256:
25394 case Intrinsic::x86_avx_vtestc_pd_256:
25395 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25396 unsigned TestOpc = X86ISD::PTEST;
25397 X86::CondCode X86CC;
25398 switch (IntNo) {
25399    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25400 case Intrinsic::x86_avx512_ktestc_b:
25401 case Intrinsic::x86_avx512_ktestc_w:
25402 case Intrinsic::x86_avx512_ktestc_d:
25403 case Intrinsic::x86_avx512_ktestc_q:
25404 // CF = 1
25405 TestOpc = X86ISD::KTEST;
25406 X86CC = X86::COND_B;
25407 break;
25408 case Intrinsic::x86_avx512_ktestz_b:
25409 case Intrinsic::x86_avx512_ktestz_w:
25410 case Intrinsic::x86_avx512_ktestz_d:
25411 case Intrinsic::x86_avx512_ktestz_q:
25412 TestOpc = X86ISD::KTEST;
25413 X86CC = X86::COND_E;
25414 break;
25415 case Intrinsic::x86_avx_vtestz_ps:
25416 case Intrinsic::x86_avx_vtestz_pd:
25417 case Intrinsic::x86_avx_vtestz_ps_256:
25418 case Intrinsic::x86_avx_vtestz_pd_256:
25419 TestOpc = X86ISD::TESTP;
25420      LLVM_FALLTHROUGH;
25421 case Intrinsic::x86_sse41_ptestz:
25422 case Intrinsic::x86_avx_ptestz_256:
25423 // ZF = 1
25424 X86CC = X86::COND_E;
25425 break;
25426 case Intrinsic::x86_avx_vtestc_ps:
25427 case Intrinsic::x86_avx_vtestc_pd:
25428 case Intrinsic::x86_avx_vtestc_ps_256:
25429 case Intrinsic::x86_avx_vtestc_pd_256:
25430 TestOpc = X86ISD::TESTP;
25431      LLVM_FALLTHROUGH;
25432 case Intrinsic::x86_sse41_ptestc:
25433 case Intrinsic::x86_avx_ptestc_256:
25434 // CF = 1
25435 X86CC = X86::COND_B;
25436 break;
25437 case Intrinsic::x86_avx_vtestnzc_ps:
25438 case Intrinsic::x86_avx_vtestnzc_pd:
25439 case Intrinsic::x86_avx_vtestnzc_ps_256:
25440 case Intrinsic::x86_avx_vtestnzc_pd_256:
25441 TestOpc = X86ISD::TESTP;
25442      LLVM_FALLTHROUGH;
25443 case Intrinsic::x86_sse41_ptestnzc:
25444 case Intrinsic::x86_avx_ptestnzc_256:
25445 // ZF and CF = 0
25446 X86CC = X86::COND_A;
25447 break;
25448 }
25449
25450 SDValue LHS = Op.getOperand(1);
25451 SDValue RHS = Op.getOperand(2);
25452 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25453 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25454 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25455 }
25456
25457 case Intrinsic::x86_sse42_pcmpistria128:
25458 case Intrinsic::x86_sse42_pcmpestria128:
25459 case Intrinsic::x86_sse42_pcmpistric128:
25460 case Intrinsic::x86_sse42_pcmpestric128:
25461 case Intrinsic::x86_sse42_pcmpistrio128:
25462 case Intrinsic::x86_sse42_pcmpestrio128:
25463 case Intrinsic::x86_sse42_pcmpistris128:
25464 case Intrinsic::x86_sse42_pcmpestris128:
25465 case Intrinsic::x86_sse42_pcmpistriz128:
25466 case Intrinsic::x86_sse42_pcmpestriz128: {
25467 unsigned Opcode;
25468 X86::CondCode X86CC;
25469 switch (IntNo) {
25470    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25471 case Intrinsic::x86_sse42_pcmpistria128:
25472 Opcode = X86ISD::PCMPISTR;
25473 X86CC = X86::COND_A;
25474 break;
25475 case Intrinsic::x86_sse42_pcmpestria128:
25476 Opcode = X86ISD::PCMPESTR;
25477 X86CC = X86::COND_A;
25478 break;
25479 case Intrinsic::x86_sse42_pcmpistric128:
25480 Opcode = X86ISD::PCMPISTR;
25481 X86CC = X86::COND_B;
25482 break;
25483 case Intrinsic::x86_sse42_pcmpestric128:
25484 Opcode = X86ISD::PCMPESTR;
25485 X86CC = X86::COND_B;
25486 break;
25487 case Intrinsic::x86_sse42_pcmpistrio128:
25488 Opcode = X86ISD::PCMPISTR;
25489 X86CC = X86::COND_O;
25490 break;
25491 case Intrinsic::x86_sse42_pcmpestrio128:
25492 Opcode = X86ISD::PCMPESTR;
25493 X86CC = X86::COND_O;
25494 break;
25495 case Intrinsic::x86_sse42_pcmpistris128:
25496 Opcode = X86ISD::PCMPISTR;
25497 X86CC = X86::COND_S;
25498 break;
25499 case Intrinsic::x86_sse42_pcmpestris128:
25500 Opcode = X86ISD::PCMPESTR;
25501 X86CC = X86::COND_S;
25502 break;
25503 case Intrinsic::x86_sse42_pcmpistriz128:
25504 Opcode = X86ISD::PCMPISTR;
25505 X86CC = X86::COND_E;
25506 break;
25507 case Intrinsic::x86_sse42_pcmpestriz128:
25508 Opcode = X86ISD::PCMPESTR;
25509 X86CC = X86::COND_E;
25510 break;
25511 }
25512 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25513 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25514 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25515 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25516 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25517 }
25518
25519 case Intrinsic::x86_sse42_pcmpistri128:
25520 case Intrinsic::x86_sse42_pcmpestri128: {
25521 unsigned Opcode;
25522 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25523 Opcode = X86ISD::PCMPISTR;
25524 else
25525 Opcode = X86ISD::PCMPESTR;
25526
25527 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25528 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25529 return DAG.getNode(Opcode, dl, VTs, NewOps);
25530 }
25531
25532 case Intrinsic::x86_sse42_pcmpistrm128:
25533 case Intrinsic::x86_sse42_pcmpestrm128: {
25534 unsigned Opcode;
25535 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
25536 Opcode = X86ISD::PCMPISTR;
25537 else
25538 Opcode = X86ISD::PCMPESTR;
25539
25540 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25541 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25542 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
25543 }
25544
25545 case Intrinsic::eh_sjlj_lsda: {
25546 MachineFunction &MF = DAG.getMachineFunction();
25547 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25548 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25549 auto &Context = MF.getMMI().getContext();
25550 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
25551 Twine(MF.getFunctionNumber()));
25552 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
25553 DAG.getMCSymbol(S, PtrVT));
25554 }
25555
25556 case Intrinsic::x86_seh_lsda: {
25557 // Compute the symbol for the LSDA. We know it'll get emitted later.
25558 MachineFunction &MF = DAG.getMachineFunction();
25559 SDValue Op1 = Op.getOperand(1);
25560 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
25561 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
25562 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25563
25564 // Generate a simple absolute symbol reference. This intrinsic is only
25565 // supported on 32-bit Windows, which isn't PIC.
25566 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
25567 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
25568 }
25569
25570 case Intrinsic::eh_recoverfp: {
25571 SDValue FnOp = Op.getOperand(1);
25572 SDValue IncomingFPOp = Op.getOperand(2);
25573 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
25574 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
25575 if (!Fn)
25576 report_fatal_error(
25577 "llvm.eh.recoverfp must take a function as the first argument");
25578 return recoverFramePointer(DAG, Fn, IncomingFPOp);
25579 }
25580
25581 case Intrinsic::localaddress: {
25582 // Returns one of the stack, base, or frame pointer registers, depending on
25583 // which is used to reference local variables.
25584 MachineFunction &MF = DAG.getMachineFunction();
25585 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25586 unsigned Reg;
25587 if (RegInfo->hasBasePointer(MF))
25588 Reg = RegInfo->getBaseRegister();
25589 else { // Handles the SP or FP case.
25590 bool CantUseFP = RegInfo->needsStackRealignment(MF);
25591 if (CantUseFP)
25592 Reg = RegInfo->getPtrSizedStackRegister(MF);
25593 else
25594 Reg = RegInfo->getPtrSizedFrameRegister(MF);
25595 }
25596 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
25597 }
25598
25599 case Intrinsic::x86_avx512_vp2intersect_q_512:
25600 case Intrinsic::x86_avx512_vp2intersect_q_256:
25601 case Intrinsic::x86_avx512_vp2intersect_q_128:
25602 case Intrinsic::x86_avx512_vp2intersect_d_512:
25603 case Intrinsic::x86_avx512_vp2intersect_d_256:
25604 case Intrinsic::x86_avx512_vp2intersect_d_128: {
25605 MVT MaskVT = Op.getSimpleValueType();
25606
25607 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
25608 SDLoc DL(Op);
25609
25610 SDValue Operation =
25611 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
25612 Op->getOperand(1), Op->getOperand(2));
25613
25614 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
25615 MaskVT, Operation);
25616 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
25617 MaskVT, Operation);
25618 return DAG.getMergeValues({Result0, Result1}, DL);
25619 }
25620 case Intrinsic::x86_mmx_pslli_w:
25621 case Intrinsic::x86_mmx_pslli_d:
25622 case Intrinsic::x86_mmx_pslli_q:
25623 case Intrinsic::x86_mmx_psrli_w:
25624 case Intrinsic::x86_mmx_psrli_d:
25625 case Intrinsic::x86_mmx_psrli_q:
25626 case Intrinsic::x86_mmx_psrai_w:
25627 case Intrinsic::x86_mmx_psrai_d: {
25628 SDLoc DL(Op);
25629 SDValue ShAmt = Op.getOperand(2);
25630 // If the argument is a constant, convert it to a target constant.
25631 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
25632      // Clamp out-of-bounds shift amounts since they will otherwise be masked
25633      // to 8 bits, which may make them no longer out of bounds.
25634 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
25635 if (ShiftAmount == 0)
25636 return Op.getOperand(1);
25637
25638 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
25639 Op.getOperand(0), Op.getOperand(1),
25640 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
25641 }
25642
25643 unsigned NewIntrinsic;
25644 switch (IntNo) {
25645    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25646 case Intrinsic::x86_mmx_pslli_w:
25647 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
25648 break;
25649 case Intrinsic::x86_mmx_pslli_d:
25650 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
25651 break;
25652 case Intrinsic::x86_mmx_pslli_q:
25653 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
25654 break;
25655 case Intrinsic::x86_mmx_psrli_w:
25656 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
25657 break;
25658 case Intrinsic::x86_mmx_psrli_d:
25659 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
25660 break;
25661 case Intrinsic::x86_mmx_psrli_q:
25662 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
25663 break;
25664 case Intrinsic::x86_mmx_psrai_w:
25665 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
25666 break;
25667 case Intrinsic::x86_mmx_psrai_d:
25668 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
25669 break;
25670 }
25671
25672    // The vector shift intrinsics with scalar shift amounts use 32-bit values, but
25673    // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits into an
25674    // MMX register.
25675 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
25676 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
25677 DAG.getTargetConstant(NewIntrinsic, DL,
25678 getPointerTy(DAG.getDataLayout())),
25679 Op.getOperand(1), ShAmt);
25680 }
25681 }
25682}
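The three rounding-mode lambdas near the top of this function all decode the same immediate layout: an optional suppress-all-exceptions (NO_EXC) bit plus a rounding-control field. A standalone mirror of that decoding is sketched below; the constant values are local assumptions for illustration (they are meant to mirror X86::STATIC_ROUNDING but are redefined here rather than included from LLVM).

// Standalone mirror of the isRoundModeSAE / isRoundModeSAEToX checks above.
#include <cstdio>

namespace StaticRounding {
enum : unsigned {
  TO_NEAREST_INT = 0,
  TO_NEG_INF = 1,
  TO_POS_INF = 2,
  TO_ZERO = 3,
  CUR_DIRECTION = 4,
  NO_EXC = 8,
};
}

// True if the immediate requests SAE with either no rounding override or an
// explicit "current direction".
static bool isSAE(unsigned RC) {
  if (!(RC & StaticRounding::NO_EXC))
    return false;
  RC ^= StaticRounding::NO_EXC;
  return RC == 0 || RC == StaticRounding::CUR_DIRECTION;
}

// True if the immediate requests SAE plus one of the four explicit rounding
// modes; on success the rounding control is returned through RC.
static bool isSAEToX(unsigned Imm, unsigned &RC) {
  if (!(Imm & StaticRounding::NO_EXC))
    return false;
  RC = Imm ^ StaticRounding::NO_EXC;
  return RC == StaticRounding::TO_NEAREST_INT || RC == StaticRounding::TO_NEG_INF ||
         RC == StaticRounding::TO_POS_INF || RC == StaticRounding::TO_ZERO;
}

int main() {
  unsigned RC = ~0u;
  std::printf("%d\n", isSAE(StaticRounding::NO_EXC));                          // 1
  std::printf("%d\n", isSAEToX(StaticRounding::NO_EXC |
                               StaticRounding::TO_NEG_INF, RC));               // 1
  std::printf("%d\n", isSAE(StaticRounding::TO_ZERO));                         // 0 (no NO_EXC)
  std::printf("RC=%u\n", RC);                                                  // RC=1
}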
25683
25684static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
25685 SDValue Src, SDValue Mask, SDValue Base,
25686 SDValue Index, SDValue ScaleOp, SDValue Chain,
25687 const X86Subtarget &Subtarget) {
25688 SDLoc dl(Op);
25689 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
25690 // Scale must be constant.
25691 if (!C)
25692 return SDValue();
25693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25694 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
25695 TLI.getPointerTy(DAG.getDataLayout()));
25696 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
25697 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
25698 // If source is undef or we know it won't be used, use a zero vector
25699 // to break register dependency.
25700 // TODO: use undef instead and let BreakFalseDeps deal with it?
25701 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
25702 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
25703
25704 // Cast mask to an integer type.
25705 Mask = DAG.getBitcast(MaskVT, Mask);
25706
25707 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
25708
25709 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
25710 SDValue Res =
25711 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
25712 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
25713 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
25714}
25715
25716static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
25717 SDValue Src, SDValue Mask, SDValue Base,
25718 SDValue Index, SDValue ScaleOp, SDValue Chain,
25719 const X86Subtarget &Subtarget) {
25720 MVT VT = Op.getSimpleValueType();
25721 SDLoc dl(Op);
25722 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
25723 // Scale must be constant.
25724 if (!C)
25725 return SDValue();
25726 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25727 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
25728 TLI.getPointerTy(DAG.getDataLayout()));
25729 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
25730 VT.getVectorNumElements());
25731 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
25732
25733 // We support two versions of the gather intrinsics. One with scalar mask and
25734 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
25735 if (Mask.getValueType() != MaskVT)
25736 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25737
25738 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
25739 // If source is undef or we know it won't be used, use a zero vector
25740 // to break register dependency.
25741 // TODO: use undef instead and let BreakFalseDeps deal with it?
25742 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
25743 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
25744
25745 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
25746
25747 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
25748 SDValue Res =
25749 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
25750 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
25751 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
25752}
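At the memory level, the gather node built by the two helpers above behaves like a per-lane conditional load. A plain C++ sketch of those semantics, ignoring the chain and memory-operand machinery (illustrative only; Scale here multiplies element indices, whereas the hardware scales byte offsets):

// Per-lane semantics of a masked gather: active lanes load Base[Index[i] * Scale],
// inactive lanes keep the pass-through source.
#include <array>
#include <cstdio>

template <std::size_t N>
std::array<int, N> gather(const int *Base, const std::array<int, N> &Index,
                          int Scale, const std::array<bool, N> &Mask,
                          const std::array<int, N> &Src) {
  std::array<int, N> Result{};
  for (std::size_t I = 0; I < N; ++I)
    Result[I] = Mask[I] ? Base[Index[I] * Scale] : Src[I];
  return Result;
}

int main() {
  int Mem[16];
  for (int I = 0; I < 16; ++I)
    Mem[I] = 100 + I;
  std::array<int, 4> Index{0, 2, 4, 6}, Src{-1, -1, -1, -1};
  std::array<bool, 4> Mask{true, true, false, true};
  auto R = gather(Mem, Index, /*Scale=*/1, Mask, Src);
  for (int V : R)
    std::printf("%d ", V); // prints: 100 102 -1 106
  std::printf("\n");
}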
25753
25754static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
25755 SDValue Src, SDValue Mask, SDValue Base,
25756 SDValue Index, SDValue ScaleOp, SDValue Chain,
25757 const X86Subtarget &Subtarget) {
25758 SDLoc dl(Op);
25759 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
25760 // Scale must be constant.
25761 if (!C)
25762 return SDValue();
25763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25764 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
25765 TLI.getPointerTy(DAG.getDataLayout()));
25766 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
25767 Src.getSimpleValueType().getVectorNumElements());
25768 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
25769
25770 // We support two versions of the scatter intrinsics. One with scalar mask and
25771 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
25772 if (Mask.getValueType() != MaskVT)
25773 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25774
25775 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
25776
25777 SDVTList VTs = DAG.getVTList(MVT::Other);
25778 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
25779 SDValue Res =
25780 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
25781 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
25782 return Res;
25783}
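The scatter node is the store-side counterpart: active lanes write, inactive lanes leave memory untouched. A matching plain C++ sketch (illustrative only, element-indexed rather than byte-scaled):

// Per-lane semantics of a masked scatter: active lanes store Src[i] to
// Base[Index[i] * Scale]; inactive lanes do not touch memory.
#include <array>
#include <cstdio>

template <std::size_t N>
void scatter(int *Base, const std::array<int, N> &Index, int Scale,
             const std::array<bool, N> &Mask, const std::array<int, N> &Src) {
  for (std::size_t I = 0; I < N; ++I)
    if (Mask[I])
      Base[Index[I] * Scale] = Src[I];
}

int main() {
  int Mem[8] = {0};
  std::array<int, 4> Index{1, 3, 5, 7}, Src{11, 33, 55, 77};
  std::array<bool, 4> Mask{true, false, true, false};
  scatter(Mem, Index, /*Scale=*/1, Mask, Src);
  for (int V : Mem)
    std::printf("%d ", V); // prints: 0 11 0 0 0 55 0 0
  std::printf("\n");
}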
25784
25785static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
25786 SDValue Mask, SDValue Base, SDValue Index,
25787 SDValue ScaleOp, SDValue Chain,
25788 const X86Subtarget &Subtarget) {
25789 SDLoc dl(Op);
25790 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
25791 // Scale must be constant.
25792 if (!C)
25793 return SDValue();
25794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25795 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
25796 TLI.getPointerTy(DAG.getDataLayout()));
25797 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
25798 SDValue Segment = DAG.getRegister(0, MVT::i32);
25799 MVT MaskVT =
25800 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
25801 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25802 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
25803 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
25804 return SDValue(Res, 0);
25805}
25806
25807/// Handles the lowering of builtin intrinsics with chain that return their
25808/// value into registers EDX:EAX.
25809/// If operand SrcReg is a valid register identifier, then operand 2 of N is
25810/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
25811/// TargetOpcode.
25812/// Returns a Glue value which can be used to add an extra copy-from-reg if the
25813/// expanded intrinsic implicitly defines extra registers (i.e. not just
25814/// EDX:EAX).
25815static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
25816 SelectionDAG &DAG,
25817 unsigned TargetOpcode,
25818 unsigned SrcReg,
25819 const X86Subtarget &Subtarget,
25820 SmallVectorImpl<SDValue> &Results) {
25821 SDValue Chain = N->getOperand(0);
25822 SDValue Glue;
25823
25824 if (SrcReg) {
25825    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
25826 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
25827 Glue = Chain.getValue(1);
25828 }
25829
25830 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25831 SDValue N1Ops[] = {Chain, Glue};
25832 SDNode *N1 = DAG.getMachineNode(
25833 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
25834 Chain = SDValue(N1, 0);
25835
25836  // The expanded instruction returns its 64-bit result in registers EDX:EAX.
25837 SDValue LO, HI;
25838 if (Subtarget.is64Bit()) {
25839 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
25840 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
25841 LO.getValue(2));
25842 } else {
25843 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
25844 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
25845 LO.getValue(2));
25846 }
25847 Chain = HI.getValue(1);
25848 Glue = HI.getValue(2);
25849
25850 if (Subtarget.is64Bit()) {
25851 // Merge the two 32-bit values into a 64-bit one.
25852 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
25853 DAG.getConstant(32, DL, MVT::i8));
25854 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
25855 Results.push_back(Chain);
25856 return Glue;
25857 }
25858
25859 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
25860 SDValue Ops[] = { LO, HI };
25861 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
25862 Results.push_back(Pair);
25863 Results.push_back(Chain);
25864 return Glue;
25865}
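On 64-bit targets the helper above combines the two halves with a shift and an OR. A minimal model of that merge (the register values are invented for illustration):

// Model of the EDX:EAX merge done above for 64-bit targets: (HI << 32) | LO.
#include <cstdint>
#include <cstdio>

static std::uint64_t mergeEdxEax(std::uint32_t Eax, std::uint32_t Edx) {
  return (static_cast<std::uint64_t>(Edx) << 32) | Eax;
}

int main() {
  std::uint32_t Eax = 0xDDCCBBAA, Edx = 0x00000012; // low and high halves
  std::printf("0x%016llX\n",
              static_cast<unsigned long long>(mergeEdxEax(Eax, Edx)));
  // prints 0x00000012DDCCBBAA
}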
25866
25867/// Handles the lowering of builtin intrinsics that read the time stamp counter
25868/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
25869/// READCYCLECOUNTER nodes.
25870static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
25871 SelectionDAG &DAG,
25872 const X86Subtarget &Subtarget,
25873 SmallVectorImpl<SDValue> &Results) {
25874 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
25875 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
25876 // and the EAX register is loaded with the low-order 32 bits.
25877 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
25878 /* NoRegister */0, Subtarget,
25879 Results);
25880 if (Opcode != X86::RDTSCP)
25881 return;
25882
25883 SDValue Chain = Results[1];
25884 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
25885 // the ECX register. Add 'ecx' explicitly to the chain.
25886 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
25887 Results[1] = ecx;
25888 Results.push_back(ecx.getValue(1));
25889}
25890
25891static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
25892 SelectionDAG &DAG) {
25893 SmallVector<SDValue, 3> Results;
25894 SDLoc DL(Op);
25895 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
25896 Results);
25897 return DAG.getMergeValues(Results, DL);
25898}
25899
25900static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
25901 MachineFunction &MF = DAG.getMachineFunction();
25902 SDValue Chain = Op.getOperand(0);
25903 SDValue RegNode = Op.getOperand(2);
25904 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
25905 if (!EHInfo)
25906 report_fatal_error("EH registrations only live in functions using WinEH");
25907
25908 // Cast the operand to an alloca, and remember the frame index.
25909 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
25910 if (!FINode)
25911 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
25912 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
25913
25914 // Return the chain operand without making any DAG nodes.
25915 return Chain;
25916}
25917
25918static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
25919 MachineFunction &MF = DAG.getMachineFunction();
25920 SDValue Chain = Op.getOperand(0);
25921 SDValue EHGuard = Op.getOperand(2);
25922 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
25923 if (!EHInfo)
25924 report_fatal_error("EHGuard only live in functions using WinEH");
25925
25926 // Cast the operand to an alloca, and remember the frame index.
25927 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
25928 if (!FINode)
25929 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
25930 EHInfo->EHGuardFrameIndex = FINode->getIndex();
25931
25932 // Return the chain operand without making any DAG nodes.
25933 return Chain;
25934}
25935
25936/// Emit Truncating Store with signed or unsigned saturation.
25937static SDValue
25938EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
25939 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
25940 SelectionDAG &DAG) {
25941 SDVTList VTs = DAG.getVTList(MVT::Other);
25942 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
25943 SDValue Ops[] = { Chain, Val, Ptr, Undef };
25944 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
25945 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25946}
25947
25948/// Emit Masked Truncating Store with signed or unsigned saturation.
25949static SDValue
25950EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
25951 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
25952 MachineMemOperand *MMO, SelectionDAG &DAG) {
25953 SDVTList VTs = DAG.getVTList(MVT::Other);
25954 SDValue Ops[] = { Chain, Val, Ptr, Mask };
25955 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
25956 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25957}
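Both emitters above produce truncating stores with signed or unsigned saturation. A plain C++ sketch of what that saturation means for a 32-to-8-bit store (illustrative value-level model only, not the node semantics machinery):

// Value-level model of a truncating store with saturation: the value is clamped
// to the destination type's range before being truncated.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static std::int8_t truncStoreS8(std::int32_t V) { // signed saturation
  return static_cast<std::int8_t>(std::max(-128, std::min(127, V)));
}

static std::uint8_t truncStoreU8(std::int32_t V) { // unsigned saturation
  return static_cast<std::uint8_t>(std::max(0, std::min(255, V)));
}

int main() {
  std::printf("%d %d\n", (int)truncStoreS8(300), (int)truncStoreS8(-300)); // 127 -128
  std::printf("%d %d\n", (int)truncStoreU8(300), (int)truncStoreU8(-300)); // 255 0
}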
25958
25959static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
25960 SelectionDAG &DAG) {
25961 unsigned IntNo = Op.getConstantOperandVal(1);
25962 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
25963 if (!IntrData) {
25964 switch (IntNo) {
25965 case llvm::Intrinsic::x86_seh_ehregnode:
25966 return MarkEHRegistrationNode(Op, DAG);
25967 case llvm::Intrinsic::x86_seh_ehguard:
25968 return MarkEHGuard(Op, DAG);
25969 case llvm::Intrinsic::x86_rdpkru: {
25970 SDLoc dl(Op);
25971 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25972 // Create a RDPKRU node and pass 0 to the ECX parameter.
25973 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
25974 DAG.getConstant(0, dl, MVT::i32));
25975 }
25976 case llvm::Intrinsic::x86_wrpkru: {
25977 SDLoc dl(Op);
25978 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
25979 // to the EDX and ECX parameters.
25980 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
25981 Op.getOperand(0), Op.getOperand(2),
25982 DAG.getConstant(0, dl, MVT::i32),
25983 DAG.getConstant(0, dl, MVT::i32));
25984 }
25985 case llvm::Intrinsic::x86_flags_read_u32:
25986 case llvm::Intrinsic::x86_flags_read_u64:
25987 case llvm::Intrinsic::x86_flags_write_u32:
25988 case llvm::Intrinsic::x86_flags_write_u64: {
25989 // We need a frame pointer because this will get lowered to a PUSH/POP
25990 // sequence.
25991 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
25992 MFI.setHasCopyImplyingStackAdjustment(true);
25993 // Don't do anything here, we will expand these intrinsics out later
25994 // during FinalizeISel in EmitInstrWithCustomInserter.
25995 return Op;
25996 }
25997 case Intrinsic::x86_lwpins32:
25998 case Intrinsic::x86_lwpins64:
25999 case Intrinsic::x86_umwait:
26000 case Intrinsic::x86_tpause: {
26001 SDLoc dl(Op);
26002 SDValue Chain = Op->getOperand(0);
26003 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26004 unsigned Opcode;
26005
26006 switch (IntNo) {
26007 default: llvm_unreachable("Impossible intrinsic");
26008 case Intrinsic::x86_umwait:
26009 Opcode = X86ISD::UMWAIT;
26010 break;
26011 case Intrinsic::x86_tpause:
26012 Opcode = X86ISD::TPAUSE;
26013 break;
26014 case Intrinsic::x86_lwpins32:
26015 case Intrinsic::x86_lwpins64:
26016 Opcode = X86ISD::LWPINS;
26017 break;
26018 }
26019
26020 SDValue Operation =
26021 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26022 Op->getOperand(3), Op->getOperand(4));
26023 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26024 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26025 Operation.getValue(1));
26026 }
26027 case Intrinsic::x86_enqcmd:
26028 case Intrinsic::x86_enqcmds: {
26029 SDLoc dl(Op);
26030 SDValue Chain = Op.getOperand(0);
26031 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26032 unsigned Opcode;
26033 switch (IntNo) {
26034 default: llvm_unreachable("Impossible intrinsic!");
26035 case Intrinsic::x86_enqcmd:
26036 Opcode = X86ISD::ENQCMD;
26037 break;
26038 case Intrinsic::x86_enqcmds:
26039 Opcode = X86ISD::ENQCMDS;
26040 break;
26041 }
26042 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26043 Op.getOperand(3));
26044 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26045 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26046 Operation.getValue(1));
26047 }
26048 case Intrinsic::x86_aesenc128kl:
26049 case Intrinsic::x86_aesdec128kl:
26050 case Intrinsic::x86_aesenc256kl:
26051 case Intrinsic::x86_aesdec256kl: {
26052 SDLoc DL(Op);
26053 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26054 SDValue Chain = Op.getOperand(0);
26055 unsigned Opcode;
26056
26057 switch (IntNo) {
26058 default: llvm_unreachable("Impossible intrinsic");
26059 case Intrinsic::x86_aesenc128kl:
26060 Opcode = X86ISD::AESENC128KL;
26061 break;
26062 case Intrinsic::x86_aesdec128kl:
26063 Opcode = X86ISD::AESDEC128KL;
26064 break;
26065 case Intrinsic::x86_aesenc256kl:
26066 Opcode = X86ISD::AESENC256KL;
26067 break;
26068 case Intrinsic::x86_aesdec256kl:
26069 Opcode = X86ISD::AESDEC256KL;
26070 break;
26071 }
26072
26073 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26074 MachineMemOperand *MMO = MemIntr->getMemOperand();
26075 EVT MemVT = MemIntr->getMemoryVT();
26076 SDValue Operation = DAG.getMemIntrinsicNode(
26077 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26078 MMO);
26079 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26080
26081 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26082 {ZF, Operation.getValue(0), Operation.getValue(2)});
26083 }
26084 case Intrinsic::x86_aesencwide128kl:
26085 case Intrinsic::x86_aesdecwide128kl:
26086 case Intrinsic::x86_aesencwide256kl:
26087 case Intrinsic::x86_aesdecwide256kl: {
26088 SDLoc DL(Op);
26089 SDVTList VTs = DAG.getVTList(
26090 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26091 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26092 SDValue Chain = Op.getOperand(0);
26093 unsigned Opcode;
26094
26095 switch (IntNo) {
26096 default: llvm_unreachable("Impossible intrinsic");
26097 case Intrinsic::x86_aesencwide128kl:
26098 Opcode = X86ISD::AESENCWIDE128KL;
26099 break;
26100 case Intrinsic::x86_aesdecwide128kl:
26101 Opcode = X86ISD::AESDECWIDE128KL;
26102 break;
26103 case Intrinsic::x86_aesencwide256kl:
26104 Opcode = X86ISD::AESENCWIDE256KL;
26105 break;
26106 case Intrinsic::x86_aesdecwide256kl:
26107 Opcode = X86ISD::AESDECWIDE256KL;
26108 break;
26109 }
26110
26111 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26112 MachineMemOperand *MMO = MemIntr->getMemOperand();
26113 EVT MemVT = MemIntr->getMemoryVT();
26114 SDValue Operation = DAG.getMemIntrinsicNode(
26115 Opcode, DL, VTs,
26116 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26117 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26118 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26119 MemVT, MMO);
26120 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26121
26122 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26123 {ZF, Operation.getValue(1), Operation.getValue(2),
26124 Operation.getValue(3), Operation.getValue(4),
26125 Operation.getValue(5), Operation.getValue(6),
26126 Operation.getValue(7), Operation.getValue(8),
26127 Operation.getValue(9)});
26128 }
26129 case Intrinsic::x86_testui: {
26130 SDLoc dl(Op);
26131 SDValue Chain = Op.getOperand(0);
26132 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26133 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26134 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26135 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26136 Operation.getValue(1));
26137 }
26138 }
26139 return SDValue();
26140 }
26141
26142 SDLoc dl(Op);
26143 switch(IntrData->Type) {
26144 default: llvm_unreachable("Unknown Intrinsic Type");
26145 case RDSEED:
26146 case RDRAND: {
26147 // Emit the node with the right value type.
26148 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26149 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26150
26151 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26152 // Otherwise return the value from Rand, which is always 0, cast to i32.
26153 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26154 DAG.getConstant(1, dl, Op->getValueType(1)),
26155 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26156 SDValue(Result.getNode(), 1)};
26157 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26158
26159 // Return { result, isValid, chain }.
26160 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26161 SDValue(Result.getNode(), 2));
26162 }
26163 case GATHER_AVX2: {
26164 SDValue Chain = Op.getOperand(0);
26165 SDValue Src = Op.getOperand(2);
26166 SDValue Base = Op.getOperand(3);
26167 SDValue Index = Op.getOperand(4);
26168 SDValue Mask = Op.getOperand(5);
26169 SDValue Scale = Op.getOperand(6);
26170 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26171 Scale, Chain, Subtarget);
26172 }
26173 case GATHER: {
26174 //gather(v1, mask, index, base, scale);
26175 SDValue Chain = Op.getOperand(0);
26176 SDValue Src = Op.getOperand(2);
26177 SDValue Base = Op.getOperand(3);
26178 SDValue Index = Op.getOperand(4);
26179 SDValue Mask = Op.getOperand(5);
26180 SDValue Scale = Op.getOperand(6);
26181 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26182 Chain, Subtarget);
26183 }
26184 case SCATTER: {
26185 //scatter(base, mask, index, v1, scale);
26186 SDValue Chain = Op.getOperand(0);
26187 SDValue Base = Op.getOperand(2);
26188 SDValue Mask = Op.getOperand(3);
26189 SDValue Index = Op.getOperand(4);
26190 SDValue Src = Op.getOperand(5);
26191 SDValue Scale = Op.getOperand(6);
26192 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26193 Scale, Chain, Subtarget);
26194 }
26195 case PREFETCH: {
26196 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26197 assert((HintVal == 2 || HintVal == 3) &&
26198 "Wrong prefetch hint in intrinsic: should be 2 or 3");
26199 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26200 SDValue Chain = Op.getOperand(0);
26201 SDValue Mask = Op.getOperand(2);
26202 SDValue Index = Op.getOperand(3);
26203 SDValue Base = Op.getOperand(4);
26204 SDValue Scale = Op.getOperand(5);
26205 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26206 Subtarget);
26207 }
26208 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26209 case RDTSC: {
26210 SmallVector<SDValue, 2> Results;
26211 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26212 Results);
26213 return DAG.getMergeValues(Results, dl);
26214 }
26215 // Read Performance Monitoring Counters.
26216 case RDPMC:
26217 // Get Extended Control Register.
26218 case XGETBV: {
26219 SmallVector<SDValue, 2> Results;
26220
26221 // RDPMC uses ECX to select the index of the performance counter to read.
26222 // XGETBV uses ECX to select the index of the XCR register to return.
26223 // The result is stored into registers EDX:EAX.
26224 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26225 Subtarget, Results);
26226 return DAG.getMergeValues(Results, dl);
26227 }
26228 // XTEST intrinsics.
26229 case XTEST: {
26230 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26231 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26232
26233 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26234 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26235 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26236 Ret, SDValue(InTrans.getNode(), 1));
26237 }
26238 case TRUNCATE_TO_MEM_VI8:
26239 case TRUNCATE_TO_MEM_VI16:
26240 case TRUNCATE_TO_MEM_VI32: {
26241 SDValue Mask = Op.getOperand(4);
26242 SDValue DataToTruncate = Op.getOperand(3);
26243 SDValue Addr = Op.getOperand(2);
26244 SDValue Chain = Op.getOperand(0);
26245
26246 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26247 assert(MemIntr && "Expected MemIntrinsicSDNode!");
26248
26249 EVT MemVT = MemIntr->getMemoryVT();
26250
26251 uint16_t TruncationOp = IntrData->Opc0;
26252 switch (TruncationOp) {
26253 case X86ISD::VTRUNC: {
26254 if (isAllOnesConstant(Mask)) // return just a truncate store
26255 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26256 MemIntr->getMemOperand());
26257
26258 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26259 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26260 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26261
26262 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26263 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26264 true /* truncating */);
26265 }
26266 case X86ISD::VTRUNCUS:
26267 case X86ISD::VTRUNCS: {
26268 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26269 if (isAllOnesConstant(Mask))
26270 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26271 MemIntr->getMemOperand(), DAG);
26272
26273 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26274 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26275
26276 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26277 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26278 }
26279 default:
26280 llvm_unreachable("Unsupported truncstore intrinsic");
26281 }
26282 }
26283 }
26284}
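For the RDSEED/RDRAND case above, the lowering pairs the random value with a CF-derived validity flag via CMOV. A minimal usage sketch from the C++ side, assuming a toolchain and CPU with RDRAND support (compiled with something like -mrdrnd), shows the same value/valid pair surfacing through the immintrin.h intrinsic.

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned int Value = 0;
  int Valid = _rdrand32_step(&Value);   // Valid == 1 iff the hardware set CF.
  std::printf("valid=%d value=%u\n", Valid, Value);
  return Valid ? 0 : 1;
}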
26285
26286SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26287 SelectionDAG &DAG) const {
26288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26289 MFI.setReturnAddressIsTaken(true);
26290
26291 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26292 return SDValue();
26293
26294 unsigned Depth = Op.getConstantOperandVal(0);
26295 SDLoc dl(Op);
26296 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26297
26298 if (Depth > 0) {
26299 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26300 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26301 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26302 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26303 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26304 MachinePointerInfo());
26305 }
26306
26307 // Just load the return address.
26308 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26309 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26310 MachinePointerInfo());
26311}
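A minimal usage sketch of what typically reaches LowerRETURNADDR: the __builtin_return_address builtin. Depth 0 reads the return-address slot directly; nonzero depths walk saved frame pointers first, which is only reliable when frame pointers are kept (for example with -fno-omit-frame-pointer).

#include <cstdio>

__attribute__((noinline)) void whoCalledMe() {
  void *Ret0 = __builtin_return_address(0); // return address into the caller
  std::printf("return address: %p\n", Ret0);
}

int main() {
  whoCalledMe();
  return 0;
}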
26312
26313SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26314 SelectionDAG &DAG) const {
26315 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26316 return getReturnAddressFrameIndex(DAG);
26317}
26318
26319SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26320 MachineFunction &MF = DAG.getMachineFunction();
26321 MachineFrameInfo &MFI = MF.getFrameInfo();
26322 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26323 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26324 EVT VT = Op.getValueType();
26325
26326 MFI.setFrameAddressIsTaken(true);
26327
26328 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26329 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
26330 // is not possible to crawl up the stack without looking at the unwind codes
26331 // simultaneously.
26332 int FrameAddrIndex = FuncInfo->getFAIndex();
26333 if (!FrameAddrIndex) {
26334 // Set up a frame object for the return address.
26335 unsigned SlotSize = RegInfo->getSlotSize();
26336 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26337 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26338 FuncInfo->setFAIndex(FrameAddrIndex);
26339 }
26340 return DAG.getFrameIndex(FrameAddrIndex, VT);
26341 }
26342
26343 unsigned FrameReg =
26344 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26345 SDLoc dl(Op); // FIXME probably not meaningful
26346 unsigned Depth = Op.getConstantOperandVal(0);
26347 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26348 (FrameReg == X86::EBP && VT == MVT::i32)) &&
26349 "Invalid Frame Register!");
26350 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26351 while (Depth--)
26352 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26353 MachinePointerInfo());
26354 return FrameAddr;
26355}
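On non-Windows targets the lowering above copies RBP/EBP and then loads through it once per requested depth. A minimal usage sketch with the corresponding builtin, again assuming a frame pointer is kept so the chained loads are meaningful:

#include <cstdio>

int main() {
  // __builtin_frame_address(0) corresponds to the CopyFromReg of RBP/EBP above;
  // each saved frame pointer sits at offset 0 of its frame, which is why higher
  // depths are lowered as chained loads.
  void *Frame0 = __builtin_frame_address(0);
  std::printf("frame address: %p\n", Frame0);
  return 0;
}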
26356
26357// FIXME? Maybe this could be a TableGen attribute on some registers and
26358// this table could be generated automatically from RegInfo.
26359Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26360 const MachineFunction &MF) const {
26361 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26362
26363 Register Reg = StringSwitch<unsigned>(RegName)
26364 .Case("esp", X86::ESP)
26365 .Case("rsp", X86::RSP)
26366 .Case("ebp", X86::EBP)
26367 .Case("rbp", X86::RBP)
26368 .Default(0);
26369
26370 if (Reg == X86::EBP || Reg == X86::RBP) {
26371 if (!TFI.hasFP(MF))
26372 report_fatal_error("register " + StringRef(RegName) +
26373 " is allocatable: function has no frame pointer");
26374#ifndef NDEBUG
26375 else {
26376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26379 "Invalid Frame Register!");
26380 }
26381#endif
26382 }
26383
26384 if (Reg)
26385 return Reg;
26386
26387 report_fatal_error("Invalid register name global variable");
26388}
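getRegisterByName backs the llvm.read_register/llvm.write_register intrinsics. One way this is reached from C++ is the GNU global register variable extension; support varies by compiler, and "rsp" is the name most commonly accepted, so the following is only a sketch under that assumption:

#include <cstdio>

register unsigned long StackPtr asm("rsp"); // read via llvm.read_register("rsp")

int main() {
  std::printf("rsp = 0x%lx\n", StackPtr);
  return 0;
}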
26389
26390SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26391 SelectionDAG &DAG) const {
26392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26393 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26394}
26395
26396Register X86TargetLowering::getExceptionPointerRegister(
26397 const Constant *PersonalityFn) const {
26398 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26399 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26400
26401 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26402}
26403
26404Register X86TargetLowering::getExceptionSelectorRegister(
26405 const Constant *PersonalityFn) const {
26406 // Funclet personalities don't use selectors (the runtime does the selection).
26407 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
26408 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26409}
26410
26411bool X86TargetLowering::needsFixedCatchObjects() const {
26412 return Subtarget.isTargetWin64();
26413}
26414
26415SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26416 SDValue Chain = Op.getOperand(0);
26417 SDValue Offset = Op.getOperand(1);
26418 SDValue Handler = Op.getOperand(2);
26419 SDLoc dl (Op);
26420
26421 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26422 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26423 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26424 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26425 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26426 "Invalid Frame Register!");
26427 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26428 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26429
26430 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26431 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26432 dl));
26433 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26434 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26435 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26436
26437 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26438 DAG.getRegister(StoreAddrReg, PtrVT));
26439}
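X86ISD::EH_RETURN typically originates from __builtin_eh_return, which libgcc's unwinder uses to adjust the stack and jump to a landing pad. A minimal, deliberately inert sketch, assuming the compiler and target in use support that builtin (the call is guarded so it never executes, since a real invocation needs a valid handler and frame):

#include <cstdlib>

extern "C" void maybeEhReturn(long Offset, void *Handler, int Really) {
  if (Really)                          // never true in this illustration
    __builtin_eh_return(Offset, Handler);
}

int main() {
  maybeEhReturn(0, nullptr, 0);        // does nothing; illustration only
  return EXIT_SUCCESS;
}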
26440
26441SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26442 SelectionDAG &DAG) const {
26443 SDLoc DL(Op);
26444 // If the subtarget is not 64bit, we may need the global base reg
26445 // after isel expand pseudo, i.e., after CGBR pass ran.
26446 // Therefore, ask for the GlobalBaseReg now, so that the pass
26447 // inserts the code for us in case we need it.
26448 // Otherwise, we will end up in a situation where we will
26449 // reference a virtual register that is not defined!
26450 if (!Subtarget.is64Bit()) {
26451 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26452 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26453 }
26454 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26455 DAG.getVTList(MVT::i32, MVT::Other),
26456 Op.getOperand(0), Op.getOperand(1));
26457}
26458
26459SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26460 SelectionDAG &DAG) const {
26461 SDLoc DL(Op);
26462 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26463 Op.getOperand(0), Op.getOperand(1));
26464}
26465
26466SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26467 SelectionDAG &DAG) const {
26468 SDLoc DL(Op);
26469 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26470 Op.getOperand(0));
26471}
26472
26473static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26474 return Op.getOperand(0);
26475}
26476
26477SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26478 SelectionDAG &DAG) const {
26479 SDValue Root = Op.getOperand(0);
26480 SDValue Trmp = Op.getOperand(1); // trampoline
26481 SDValue FPtr = Op.getOperand(2); // nested function
26482 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26483 SDLoc dl (Op);
26484
26485 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26486 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26487
26488 if (Subtarget.is64Bit()) {
26489 SDValue OutChains[6];
26490
26491 // Large code-model.
26492 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
26493 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26494
26495 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26496 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26497
26498 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
26499
26500 // Load the pointer to the nested function into R11.
26501 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
26502 SDValue Addr = Trmp;
26503 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26504 Addr, MachinePointerInfo(TrmpAddr));
26505
26506 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26507 DAG.getConstant(2, dl, MVT::i64));
26508 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
26509 MachinePointerInfo(TrmpAddr, 2), Align(2));
26510
26511 // Load the 'nest' parameter value into R10.
26512 // R10 is specified in X86CallingConv.td
26513 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
26514 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26515 DAG.getConstant(10, dl, MVT::i64));
26516 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26517 Addr, MachinePointerInfo(TrmpAddr, 10));
26518
26519 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26520 DAG.getConstant(12, dl, MVT::i64));
26521 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
26522 MachinePointerInfo(TrmpAddr, 12), Align(2));
26523
26524 // Jump to the nested function.
26525 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
26526 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26527 DAG.getConstant(20, dl, MVT::i64));
26528 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26529 Addr, MachinePointerInfo(TrmpAddr, 20));
26530
26531 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
26532 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26533 DAG.getConstant(22, dl, MVT::i64));
26534 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
26535 Addr, MachinePointerInfo(TrmpAddr, 22));
26536
26537 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26538 } else {
26539 const Function *Func =
26540 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
26541 CallingConv::ID CC = Func->getCallingConv();
26542 unsigned NestReg;
26543
26544 switch (CC) {
26545 default:
26546 llvm_unreachable("Unsupported calling convention");
26547 case CallingConv::C:
26548 case CallingConv::X86_StdCall: {
26549 // Pass 'nest' parameter in ECX.
26550 // Must be kept in sync with X86CallingConv.td
26551 NestReg = X86::ECX;
26552
26553 // Check that ECX wasn't needed by an 'inreg' parameter.
26554 FunctionType *FTy = Func->getFunctionType();
26555 const AttributeList &Attrs = Func->getAttributes();
26556
26557 if (!Attrs.isEmpty() && !Func->isVarArg()) {
26558 unsigned InRegCount = 0;
26559 unsigned Idx = 1;
26560
26561 for (FunctionType::param_iterator I = FTy->param_begin(),
26562 E = FTy->param_end(); I != E; ++I, ++Idx)
26563 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
26564 const DataLayout &DL = DAG.getDataLayout();
26565 // FIXME: should only count parameters that are lowered to integers.
26566 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
26567 }
26568
26569 if (InRegCount > 2) {
26570 report_fatal_error("Nest register in use - reduce number of inreg"
26571 " parameters!");
26572 }
26573 }
26574 break;
26575 }
26576 case CallingConv::X86_FastCall:
26577 case CallingConv::X86_ThisCall:
26578 case CallingConv::Fast:
26579 case CallingConv::Tail:
26580 // Pass 'nest' parameter in EAX.
26581 // Must be kept in sync with X86CallingConv.td
26582 NestReg = X86::EAX;
26583 break;
26584 }
26585
26586 SDValue OutChains[4];
26587 SDValue Addr, Disp;
26588
26589 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26590 DAG.getConstant(10, dl, MVT::i32));
26591 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
26592
26593 // This is storing the opcode for MOV32ri.
26594 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
26595 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
26596 OutChains[0] =
26597 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
26598 Trmp, MachinePointerInfo(TrmpAddr));
26599
26600 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26601 DAG.getConstant(1, dl, MVT::i32));
26602 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
26603 MachinePointerInfo(TrmpAddr, 1), Align(1));
26604
26605 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
26606 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26607 DAG.getConstant(5, dl, MVT::i32));
26608 OutChains[2] =
26609 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
26610 MachinePointerInfo(TrmpAddr, 5), Align(1));
26611
26612 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
26613 DAG.getConstant(6, dl, MVT::i32));
26614 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
26615 MachinePointerInfo(TrmpAddr, 6), Align(1));
26616
26617 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
26618 }
26619}
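To make the 64-bit trampoline byte layout above concrete, here is a minimal sketch (function name invented for the example) that writes the same 23 bytes, movabsq into r11 and r10 followed by an indirect jump through r11, at offsets 0/2/10/12/20/22. It only fills a buffer; executing it would additionally need an executable mapping, and the memcpy relies on x86's little-endian immediate layout.

#include <cstdint>
#include <cstring>

void writeTrampoline64(uint8_t *Trmp, uint64_t FPtr, uint64_t Nest) {
  // Trmp must point at a buffer of at least 23 bytes.
  Trmp[0] = 0x49; Trmp[1] = 0xBB;                     // movabsq $FPtr, %r11
  std::memcpy(Trmp + 2, &FPtr, 8);
  Trmp[10] = 0x49; Trmp[11] = 0xBA;                   // movabsq $Nest, %r10
  std::memcpy(Trmp + 12, &Nest, 8);
  Trmp[20] = 0x49; Trmp[21] = 0xFF; Trmp[22] = 0xE3;  // jmpq *%r11 (REX.WB prefix as emitted above)
}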
26620
26621SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
26622 SelectionDAG &DAG) const {
26623 /*
26624 The rounding mode is in bits 11:10 of FPSR, and has the following
26625 settings:
26626 00 Round to nearest
26627 01 Round to -inf
26628 10 Round to +inf
26629 11 Round to 0
26630
26631 FLT_ROUNDS, on the other hand, expects the following:
26632 -1 Undefined
26633 0 Round to 0
26634 1 Round to nearest
26635 2 Round to +inf
26636 3 Round to -inf
26637
26638 To perform the conversion, we use a packed lookup table of the four 2-bit
26639 values that we can index by FPSR[11:10]
26640 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
26641
26642 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
26643 */
26644
26645 MachineFunction &MF = DAG.getMachineFunction();
26646 MVT VT = Op.getSimpleValueType();
26647 SDLoc DL(Op);
26648
26649 // Save FP Control Word to stack slot
26650 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
26651 SDValue StackSlot =
26652 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
26653
26654 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
26655
26656 SDValue Chain = Op.getOperand(0);
26657 SDValue Ops[] = {Chain, StackSlot};
26658 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
26659 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
26660 Align(2), MachineMemOperand::MOStore);
26661
26662 // Load FP Control Word from stack slot
26663 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
26664 Chain = CWD.getValue(1);
26665
26666 // Mask and turn the control bits into a shift for the lookup table.
26667 SDValue Shift =
26668 DAG.getNode(ISD::SRL, DL, MVT::i16,
26669 DAG.getNode(ISD::AND, DL, MVT::i16,
26670 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
26671 DAG.getConstant(9, DL, MVT::i8));
26672 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
26673
26674 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
26675 SDValue RetVal =
26676 DAG.getNode(ISD::AND, DL, MVT::i32,
26677 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
26678 DAG.getConstant(3, DL, MVT::i32));
26679
26680 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
26681
26682 return DAG.getMergeValues({RetVal, Chain}, DL);
26683}
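A worked check of the packed lookup described in the comment above: evaluating (0x2d >> ((CW & 0xc00) >> 9)) & 3 for the four x87 rounding-control settings yields exactly the FLT_ROUNDS encoding.

#include <cassert>

constexpr int fltRoundsFromCW(unsigned CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}

int main() {
  static_assert(fltRoundsFromCW(0x000) == 1, "round to nearest -> 1");
  static_assert(fltRoundsFromCW(0x400) == 3, "round toward -inf -> 3");
  static_assert(fltRoundsFromCW(0x800) == 2, "round toward +inf -> 2");
  static_assert(fltRoundsFromCW(0xc00) == 0, "round toward zero -> 0");
  return 0;
}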
26684
26685 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
26686 //
26687 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
26688 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
26689 // split the vector, perform the operation on its Lo and Hi parts, and
26690 // concatenate the results.
26691static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
26692 const X86Subtarget &Subtarget) {
26693 assert(Op.getOpcode() == ISD::CTLZ);
26694 SDLoc dl(Op);
26695 MVT VT = Op.getSimpleValueType();
26696 MVT EltVT = VT.getVectorElementType();
26697 unsigned NumElems = VT.getVectorNumElements();
26698
26699 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
26700 "Unsupported element type");
26701
26702 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
26703 if (NumElems > 16 ||
26704 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
26705 return splitVectorIntUnary(Op, DAG);
26706
26707 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
26708 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
26709 "Unsupported value type for operation");
26710
26711 // Use the natively supported vector instruction vplzcntd.
26712 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
26713 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
26714 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
26715 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
26716
26717 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
26718}
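A scalar sketch of the zext32 + lzcnt + subtract trick used above: counting leading zeros on the zero-extended 32-bit value overcounts by exactly 32 - EltBits, so subtracting that delta recovers the narrow-element count. __builtin_clz stands in for the vector VPLZCNTD here, and the zero case is handled explicitly because the builtin is undefined for 0.

#include <cassert>
#include <cstdint>

static unsigned ctlz8ViaLzcnt32(uint8_t X) {
  unsigned Wide = X;                               // zext to i32
  unsigned Lz32 = Wide ? __builtin_clz(Wide) : 32; // 32-bit leading-zero count
  return Lz32 - (32 - 8);                          // remove the zext padding
}

int main() {
  assert(ctlz8ViaLzcnt32(0x80) == 0);
  assert(ctlz8ViaLzcnt32(0x01) == 7);
  assert(ctlz8ViaLzcnt32(0x00) == 8);
  return 0;
}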
26719
26720// Lower CTLZ using a PSHUFB lookup table implementation.
26721static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
26722 const X86Subtarget &Subtarget,
26723 SelectionDAG &DAG) {
26724 MVT VT = Op.getSimpleValueType();
26725 int NumElts = VT.getVectorNumElements();
26726 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
26727 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
26728
26729 // Per-nibble leading zero PSHUFB lookup table.
26730 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
26731 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
26732 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
26733 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
26734
26735 SmallVector<SDValue, 64> LUTVec;
26736 for (int i = 0; i < NumBytes; ++i)
26737 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
26738 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
26739
26740 // Begin by bitcasting the input to byte vector, then split those bytes
26741 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
26742 // If the hi input nibble is zero then we add both results together, otherwise
26743 // we just take the hi result (by masking the lo result to zero before the
26744 // add).
26745 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
26746 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
26747
26748 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
26749 SDValue Lo = Op0;
26750 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
26751 SDValue HiZ;
26752 if (CurrVT.is512BitVector()) {
26753 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
26754 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
26755 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
26756 } else {
26757 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
26758 }
26759
26760 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
26761 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
26762 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
26763 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
26764
26765 // Merge the result from vXi8 back to VT, working on the lo/hi halves
26766 // of the current vector width in the same way we did for the nibbles.
26767 // If the upper half of the input element is zero then add the halves'
26768 // leading zero counts together, otherwise just use the upper half's.
26769 // Double the width of the result until we are at target width.
26770 while (CurrVT != VT) {
26771 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
26772 int CurrNumElts = CurrVT.getVectorNumElements();
26773 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
26774 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
26775 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
26776
26777 // Check if the upper half of the input element is zero.
26778 if (CurrVT.is512BitVector()) {
26779 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
26780 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
26781 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
26782 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
26783 } else {
26784 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
26785 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
26786 }
26787 HiZ = DAG.getBitcast(NextVT, HiZ);
26788
26789 // Move the upper/lower halves to the lower bits as we'll be extending to
26790 // NextVT. Mask the lower result to zero if HiZ is true and add the results
26791 // together.
26792 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
26793 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
26794 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
26795 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
26796 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
26797 CurrVT = NextVT;
26798 }
26799
26800 return Res;
26801}
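A scalar sketch of the per-nibble lookup that the PSHUFB sequence above performs per lane: take the hi-nibble count from the table, and only add the lo-nibble count when the hi nibble is zero (the role of the HiZ mask).

#include <cassert>
#include <cstdint>

static const int LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};

static int ctlz8LUT(uint8_t X) {
  int Hi = X >> 4, Lo = X & 0xf;
  return LUT[Hi] + (Hi == 0 ? LUT[Lo] : 0);
}

int main() {
  assert(ctlz8LUT(0x00) == 8);
  assert(ctlz8LUT(0x01) == 7);
  assert(ctlz8LUT(0x1f) == 3);
  assert(ctlz8LUT(0xff) == 0);
  return 0;
}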
26802
26803static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
26804 const X86Subtarget &Subtarget,
26805 SelectionDAG &DAG) {
26806 MVT VT = Op.getSimpleValueType();
26807
26808 if (Subtarget.hasCDI() &&
26809 // vXi8 vectors need to be promoted to 512-bits for vXi32.
26810 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
26811 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
26812
26813 // Decompose 256-bit ops into smaller 128-bit ops.
26814 if (VT.is256BitVector() && !Subtarget.hasInt256())
26815 return splitVectorIntUnary(Op, DAG);
26816
26817 // Decompose 512-bit ops into smaller 256-bit ops.
26818 if (VT.is512BitVector() && !Subtarget.hasBWI())
26819 return splitVectorIntUnary(Op, DAG);
26820
26821 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
26822 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
26823}
26824
26825static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
26826 SelectionDAG &DAG) {
26827 MVT VT = Op.getSimpleValueType();
26828 MVT OpVT = VT;
26829 unsigned NumBits = VT.getSizeInBits();
26830 SDLoc dl(Op);
26831 unsigned Opc = Op.getOpcode();
26832
26833 if (VT.isVector())
26834 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
26835
26836 Op = Op.getOperand(0);
26837 if (VT == MVT::i8) {
26838 // Zero extend to i32 since there is not an i8 bsr.
26839 OpVT = MVT::i32;
26840 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
26841 }
26842
26843 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
26844 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
26845 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
26846
26847 if (Opc == ISD::CTLZ) {
26848 // If src is zero (i.e. bsr sets ZF), returns NumBits.
26849 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
26850 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
26851 Op.getValue(1)};
26852 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
26853 }
26854
26855 // Finally xor with NumBits-1.
26856 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
26857 DAG.getConstant(NumBits - 1, dl, OpVT));
26858
26859 if (VT == MVT::i8)
26860 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
26861 return Op;
26862}
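A scalar sketch of the BSR + CMOV + XOR sequence above for i32: BSR yields the index of the highest set bit, XOR with 31 turns that into a leading-zero count, and the CMOV substitutes 63 for a zero input so the same XOR produces 32. __builtin_clz is used here only to model BSR.

#include <cassert>

static unsigned ctlz32ViaBsr(unsigned X) {
  unsigned Bsr = X ? (31 - __builtin_clz(X)) : 63; // highest-set-bit index, 63 if zero
  return Bsr ^ 31;
}

int main() {
  assert(ctlz32ViaBsr(0x80000000u) == 0);
  assert(ctlz32ViaBsr(1u) == 31);
  assert(ctlz32ViaBsr(0u) == 32);
  return 0;
}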
26863
26864static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
26865 SelectionDAG &DAG) {
26866 MVT VT = Op.getSimpleValueType();
26867 unsigned NumBits = VT.getScalarSizeInBits();
26868 SDValue N0 = Op.getOperand(0);
26869 SDLoc dl(Op);
26870
26871 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
26872 "Only scalar CTTZ requires custom lowering");
26873
26874 // Issue a bsf (scan bits forward) which also sets EFLAGS.
26875 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
26876 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
26877
26878 // If src is zero (i.e. bsf sets ZF), returns NumBits.
26879 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
26880 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
26881 Op.getValue(1)};
26882 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
26883}
26884
26885static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
26886 const X86Subtarget &Subtarget) {
26887 MVT VT = Op.getSimpleValueType();
26888 if (VT == MVT::i16 || VT == MVT::i32)
26889 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
26890
26891 if (VT.getScalarType() == MVT::i1)
26892 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
26893 Op.getOperand(0), Op.getOperand(1));
26894
26895 if (VT == MVT::v32i16 || VT == MVT::v64i8)
26896 return splitVectorIntBinary(Op, DAG);
26897
26898 assert(Op.getSimpleValueType().is256BitVector() &&
26899 Op.getSimpleValueType().isInteger() &&
26900 "Only handle AVX 256-bit vector integer operation");
26901 return splitVectorIntBinary(Op, DAG);
26902}
26903
26904static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
26905 const X86Subtarget &Subtarget) {
26906 MVT VT = Op.getSimpleValueType();
26907 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
26908 unsigned Opcode = Op.getOpcode();
26909 SDLoc DL(Op);
26910
26911 if (VT.getScalarType() == MVT::i1) {
26912 switch (Opcode) {
26913 default: llvm_unreachable("Expected saturated arithmetic opcode");
26914 case ISD::UADDSAT:
26915 case ISD::SADDSAT:
26916 // *addsat i1 X, Y --> X | Y
26917 return DAG.getNode(ISD::OR, DL, VT, X, Y);
26918 case ISD::USUBSAT:
26919 case ISD::SSUBSAT:
26920 // *subsat i1 X, Y --> X & ~Y
26921 return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
26922 }
26923 }
26924
26925 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
26926 (VT.is256BitVector() && !Subtarget.hasInt256())) {
26927 assert(Op.getSimpleValueType().isInteger() &&
26928 "Only handle AVX vector integer operation");
26929 return splitVectorIntBinary(Op, DAG);
26930 }
26931
26932 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
26933 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26934 EVT SetCCResultType =
26935 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
26936
26937 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
26938 // usubsat X, Y --> (X >u Y) ? X - Y : 0
26939 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
26940 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
26941 // TODO: Move this to DAGCombiner?
26942 if (SetCCResultType == VT &&
26943 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
26944 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
26945 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
26946 }
26947
26948 // Use default expansion.
26949 return SDValue();
26950}
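A scalar sketch of the usubsat rewrite chosen above when UMAX is not legal: usubsat X, Y --> (X >u Y) ? X - Y : 0; the vector form can replace the select with an AND when the compare already produces all-ones/all-zeros lanes.

#include <cassert>
#include <cstdint>

static uint32_t usubsat32(uint32_t X, uint32_t Y) {
  return X > Y ? X - Y : 0;   // never wraps below zero
}

int main() {
  assert(usubsat32(10, 3) == 7);
  assert(usubsat32(3, 10) == 0);
  assert(usubsat32(7, 7) == 0);
  return 0;
}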
26951
26952static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
26953 SelectionDAG &DAG) {
26954 MVT VT = Op.getSimpleValueType();
26955 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
26956 // Since X86 does not have CMOV for 8-bit integer, we don't convert
26957 // 8-bit integer abs to NEG and CMOV.
26958 SDLoc DL(Op);
26959 SDValue N0 = Op.getOperand(0);
26960 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
26961 DAG.getConstant(0, DL, VT), N0);
26962 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
26963 SDValue(Neg.getNode(), 1)};
26964 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
26965 }
26966
26967 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
26968 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
26969 SDLoc DL(Op);
26970 SDValue Src = Op.getOperand(0);
26971 SDValue Sub =
26972 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
26973 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
26974 }
26975
26976 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
26977 assert(VT.isInteger() &&
26978 "Only handle AVX 256-bit vector integer operation");
26979 return splitVectorIntUnary(Op, DAG);
26980 }
26981
26982 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
26983 return splitVectorIntUnary(Op, DAG);
26984
26985 // Default to expand.
26986 return SDValue();
26987}
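A scalar sketch of the NEG + CMOV abs pattern above: materialize 0 - X and select between X and the negation based on the sign, with no branch (the SUB sets the flags that the CMOV consumes).

#include <cassert>
#include <cstdint>

static int32_t absViaNegSelect(int32_t X) {
  int32_t Neg = 0 - X;
  return X >= 0 ? X : Neg;   // models the CMOV on COND_GE above
}

int main() {
  assert(absViaNegSelect(5) == 5);
  assert(absViaNegSelect(-5) == 5);
  assert(absViaNegSelect(0) == 0);
  return 0;
}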
26988
26989static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
26990 MVT VT = Op.getSimpleValueType();
26991
26992 // For AVX1 cases, split to use legal ops (everything but v4i64).
26993 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
26994 return splitVectorIntBinary(Op, DAG);
26995
26996 if (VT == MVT::v32i16 || VT == MVT::v64i8)
26997 return splitVectorIntBinary(Op, DAG);
26998
26999 // Default to expand.
27000 return SDValue();
27001}
27002
27003static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27004 SelectionDAG &DAG) {
27005 SDLoc dl(Op);
27006 MVT VT = Op.getSimpleValueType();
27007
27008 if (VT.getScalarType() == MVT::i1)
27009 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
27010
27011 // Decompose 256-bit ops into 128-bit ops.
27012 if (VT.is256BitVector() && !Subtarget.hasInt256())
27013 return splitVectorIntBinary(Op, DAG);
27014
27015 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27016 return splitVectorIntBinary(Op, DAG);
27017
27018 SDValue A = Op.getOperand(0);
27019 SDValue B = Op.getOperand(1);
27020
27021 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27022 // vector pairs, multiply and truncate.
27023 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27024 unsigned NumElts = VT.getVectorNumElements();
27025
27026 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27027 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27028 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27029 return DAG.getNode(
27030 ISD::TRUNCATE, dl, VT,
27031 DAG.getNode(ISD::MUL, dl, ExVT,
27032 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27033 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27034 }
27035
27036 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27037
27038 // Extract the lo/hi parts and any-extend them to i16.
27039 // We're only going to keep the low byte of each 16-bit result element of
27040 // the pmullw, so it doesn't matter what's in the high byte of each 16-bit
27041 // input element.
27042 SDValue Undef = DAG.getUNDEF(VT);
27043 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27044 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27045
27046 SDValue BLo, BHi;
27047 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27048 // If the RHS (B) is a constant, manually unpackl/unpackh.
27049 SmallVector<SDValue, 16> LoOps, HiOps;
27050 for (unsigned i = 0; i != NumElts; i += 16) {
27051 for (unsigned j = 0; j != 8; ++j) {
27052 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27053 MVT::i16));
27054 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27055 MVT::i16));
27056 }
27057 }
27058
27059 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27060 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27061 } else {
27062 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27063 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27064 }
27065
27066 // Multiply, mask the lower 8bits of the lo/hi results and pack.
27067 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27068 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27069 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27070 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27071 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27072 }
27073
27074 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27075 if (VT == MVT::v4i32) {
27076 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27077 "Should not custom lower when pmulld is available!");
27078
27079 // Extract the odd parts.
27080 static const int UnpackMask[] = { 1, -1, 3, -1 };
27081 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27082 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27083
27084 // Multiply the even parts.
27085 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27086 DAG.getBitcast(MVT::v2i64, A),
27087 DAG.getBitcast(MVT::v2i64, B));
27088 // Now multiply odd parts.
27089 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27090 DAG.getBitcast(MVT::v2i64, Aodds),
27091 DAG.getBitcast(MVT::v2i64, Bodds));
27092
27093 Evens = DAG.getBitcast(VT, Evens);
27094 Odds = DAG.getBitcast(VT, Odds);
27095
27096 // Merge the two vectors back together with a shuffle. This expands into 2
27097 // shuffles.
27098 static const int ShufMask[] = { 0, 4, 2, 6 };
27099 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27100 }
27101
27102 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27103 "Only know how to lower V2I64/V4I64/V8I64 multiply");
27104 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27105
27106 // Ahi = psrlqi(a, 32);
27107 // Bhi = psrlqi(b, 32);
27108 //
27109 // AloBlo = pmuludq(a, b);
27110 // AloBhi = pmuludq(a, Bhi);
27111 // AhiBlo = pmuludq(Ahi, b);
27112 //
27113 // Hi = psllqi(AloBhi + AhiBlo, 32);
27114 // return AloBlo + Hi;
27115 KnownBits AKnown = DAG.computeKnownBits(A);
27116 KnownBits BKnown = DAG.computeKnownBits(B);
27117
27118 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27119 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27120 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27121
27122 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27123 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27124 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27125
27126 SDValue Zero = DAG.getConstant(0, dl, VT);
27127
27128 // Only multiply lo/hi halves that aren't known to be zero.
27129 SDValue AloBlo = Zero;
27130 if (!ALoIsZero && !BLoIsZero)
27131 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27132
27133 SDValue AloBhi = Zero;
27134 if (!ALoIsZero && !BHiIsZero) {
27135 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27136 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27137 }
27138
27139 SDValue AhiBlo = Zero;
27140 if (!AHiIsZero && !BLoIsZero) {
27141 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27142 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27143 }
27144
27145 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27146 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27147
27148 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27149}
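// Editorial sketch (not part of X86ISelLowering.cpp): the i64 lowering above
// is the schoolbook split of a 64-bit product into three 32-bit PMULUDQ
// products; the Ahi*Bhi term only affects bits >= 64 and is dropped. A scalar
// model of the same identity (the helper name is illustrative only):
static inline uint64_t mul64ViaU32Halves(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffULL, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffULL, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;                // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;                // pmuludq(a, psrlqi(b, 32))
  uint64_t AhiBlo = AHi * BLo;                // pmuludq(psrlqi(a, 32), b)
  return AloBlo + ((AloBhi + AhiBlo) << 32);  // == A * B (mod 2^64)
}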
27150
27151static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27152 SelectionDAG &DAG) {
27153 SDLoc dl(Op);
27154 MVT VT = Op.getSimpleValueType();
27155 bool IsSigned = Op->getOpcode() == ISD::MULHS;
27156 unsigned NumElts = VT.getVectorNumElements();
27157 SDValue A = Op.getOperand(0);
27158 SDValue B = Op.getOperand(1);
27159
27160 // Decompose 256-bit ops into 128-bit ops.
27161 if (VT.is256BitVector() && !Subtarget.hasInt256())
27162 return splitVectorIntBinary(Op, DAG);
27163
27164 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27165 return splitVectorIntBinary(Op, DAG);
27166
27167 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27168 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27169        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27170        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27171
27172 // PMULxD operations multiply each even value (starting at 0) of LHS with
27173 // the related value of RHS and produce a widened result.
27174 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27175 // => <2 x i64> <ae|cg>
27176 //
27177 // In other words, to have all the results, we need to perform two PMULxD:
27178 // 1. one with the even values.
27179 // 2. one with the odd values.
27180 // To achieve #2, we need to place the odd values at an even position.
27181 //
27182 // Place the odd values at an even position (basically, shift all values 1
27183 // step to the left):
27184 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
27185 9, -1, 11, -1, 13, -1, 15, -1};
27186 // <a|b|c|d> => <b|undef|d|undef>
27187 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27188 makeArrayRef(&Mask[0], NumElts));
27189 // <e|f|g|h> => <f|undef|h|undef>
27190 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27191 makeArrayRef(&Mask[0], NumElts));
27192
27193 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27194 // ints.
27195 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27196 unsigned Opcode =
27197 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27198 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27199 // => <2 x i64> <ae|cg>
27200 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27201 DAG.getBitcast(MulVT, A),
27202 DAG.getBitcast(MulVT, B)));
27203 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27204 // => <2 x i64> <bf|dh>
27205 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27206 DAG.getBitcast(MulVT, Odd0),
27207 DAG.getBitcast(MulVT, Odd1)));
27208
27209 // Shuffle it back into the right order.
27210 SmallVector<int, 16> ShufMask(NumElts);
27211 for (int i = 0; i != (int)NumElts; ++i)
27212 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
27213
27214 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27215
27216 // If we have a signed multiply but no PMULDQ fix up the result of an
27217 // unsigned multiply.
27218 if (IsSigned && !Subtarget.hasSSE41()) {
27219 SDValue Zero = DAG.getConstant(0, dl, VT);
27220 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27221 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27222 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27223 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27224
27225 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27226 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27227 }
27228
27229 return Res;
27230 }
27231
27232 // Only i8 vectors should need custom lowering after this.
27233 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27234         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27235        "Unsupported vector type");
27236
27237 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27238 // logical shift down the upper half and pack back to i8.
27239
27240 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27241 // and then ashr/lshr the upper bits down to the lower bits before multiply.
27242 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27243
27244 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27245 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27246 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27247 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27248 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27249 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27250 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27251 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27252 }
27253
27254 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27255 // to a vXi16 type. Do the multiplies, shift the results and pack the half
27256 // lane results back together.
27257
27258 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27259
27260 static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
27261 -1, -1, -1, -1, -1, -1, -1, -1};
27262
27263 // Extract the lo parts and zero/sign extend to i16.
27264 // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
27265 // shifts to sign extend. Using unpack for unsigned only requires an xor to
27266 // create zeros and a copy due to tied register constraints pre-AVX. But using
27267 // zero_extend_vector_inreg would require an additional pshufd for the high
27268 // part.
27269
27270 SDValue ALo, AHi;
27271 if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
27272 ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
27273
27274 AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
27275 AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
27276 } else if (IsSigned) {
27277 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
27278 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
27279
27280 ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
27281 AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
27282 } else {
27283 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
27284 DAG.getConstant(0, dl, VT)));
27285 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
27286 DAG.getConstant(0, dl, VT)));
27287 }
27288
27289 SDValue BLo, BHi;
27290 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27291 // If the RHS is a constant, manually unpackl/unpackh and extend.
27292 SmallVector<SDValue, 16> LoOps, HiOps;
27293 for (unsigned i = 0; i != NumElts; i += 16) {
27294 for (unsigned j = 0; j != 8; ++j) {
27295 SDValue LoOp = B.getOperand(i + j);
27296 SDValue HiOp = B.getOperand(i + j + 8);
27297
27298 if (IsSigned) {
27299 LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
27300 HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
27301 } else {
27302 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27303 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27304 }
27305
27306 LoOps.push_back(LoOp);
27307 HiOps.push_back(HiOp);
27308 }
27309 }
27310
27311 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27312 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27313 } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
27314 BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
27315
27316 BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
27317 BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
27318 } else if (IsSigned) {
27319 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
27320 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
27321
27322 BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
27323 BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
27324 } else {
27325 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
27326 DAG.getConstant(0, dl, VT)));
27327 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
27328 DAG.getConstant(0, dl, VT)));
27329 }
27330
27331 // Multiply, logically shift the upper 8 bits down into the lower 8 bits of
27332 // the lo/hi results and pack back to vXi8.
27333 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27334 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27335 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27336 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27337
27338 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27339 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27340}
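// Editorial sketch (not part of X86ISelLowering.cpp): the pre-SSE41 signed
// fixup above uses mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
// taken modulo the element width. A scalar model for 32-bit elements (the
// helper name is illustrative only):
static inline uint32_t mulhsViaMulhu(int32_t A, int32_t B) {
  uint32_t UA = (uint32_t)A, UB = (uint32_t)B;
  uint32_t MulHU = (uint32_t)(((uint64_t)UA * UB) >> 32); // unsigned high half
  uint32_t T1 = (A < 0) ? UB : 0;                         // (0 > A) & B
  uint32_t T2 = (B < 0) ? UA : 0;                         // (0 > B) & A
  return MulHU - (T1 + T2); // == high 32 bits of the signed 64-bit product
}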
27341
27342SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
27343 assert(Subtarget.isTargetWin64() && "Unexpected target");
27344 EVT VT = Op.getValueType();
27345 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
27346        "Unexpected return type for lowering");
27347
27348 RTLIB::Libcall LC;
27349 bool isSigned;
27350 switch (Op->getOpcode()) {
27351 default: llvm_unreachable("Unexpected request for libcall!");
27352 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
27353 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
27354 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
27355 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
27356 }
27357
27358 SDLoc dl(Op);
27359 SDValue InChain = DAG.getEntryNode();
27360
27361 TargetLowering::ArgListTy Args;
27362 TargetLowering::ArgListEntry Entry;
27363 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
27364 EVT ArgVT = Op->getOperand(i).getValueType();
27365 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
27366        "Unexpected argument type for lowering");
27367 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
27368 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
27369 MachinePointerInfo MPI =
27370 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
27371 Entry.Node = StackPtr;
27372 InChain =
27373 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
27374 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27375 Entry.Ty = PointerType::get(ArgTy,0);
27376 Entry.IsSExt = false;
27377 Entry.IsZExt = false;
27378 Args.push_back(Entry);
27379 }
27380
27381 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
27382 getPointerTy(DAG.getDataLayout()));
27383
27384 TargetLowering::CallLoweringInfo CLI(DAG);
27385 CLI.setDebugLoc(dl)
27386 .setChain(InChain)
27387 .setLibCallee(
27388 getLibcallCallingConv(LC),
27389 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
27390 std::move(Args))
27391 .setInRegister()
27392 .setSExtResult(isSigned)
27393 .setZExtResult(!isSigned);
27394
27395 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
27396 return DAG.getBitcast(VT, CallInfo.first);
27397}
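// Editorial note: x86-64 has no 128-by-128-bit divide instruction, so the
// routine above spills each i128 operand to a 16-byte-aligned stack slot,
// passes pointers to those slots to the RTLIB divide/remainder libcall, and
// receives the 128-bit result as v2i64 before bitcasting it back to VT.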
27398
27399// Return true if the required (according to Opcode) shift-imm form is natively
27400// supported by the Subtarget
27401static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
27402 unsigned Opcode) {
27403 if (VT.getScalarSizeInBits() < 16)
27404 return false;
27405
27406 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
27407 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
27408 return true;
27409
27410 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
27411 (VT.is256BitVector() && Subtarget.hasInt256());
27412
27413 bool AShift = LShift && (Subtarget.hasAVX512() ||
27414 (VT != MVT::v2i64 && VT != MVT::v4i64));
27415 return (Opcode == ISD::SRA) ? AShift : LShift;
27416}
27417
27418// The shift amount is a variable, but it is the same for all vector lanes.
27419// These instructions are defined together with shift-immediate.
27420static
27421bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
27422 unsigned Opcode) {
27423 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
27424}
27425
27426// Return true if the required (according to Opcode) variable-shift form is
27427// natively supported by the Subtarget
27428static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
27429 unsigned Opcode) {
27430
27431 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
27432 return false;
27433
27434 // vXi16 supported only on AVX-512, BWI
27435 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
27436 return false;
27437
27438 if (Subtarget.hasAVX512())
27439 return true;
27440
27441 bool LShift = VT.is128BitVector() || VT.is256BitVector();
27442 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
27443 return (Opcode == ISD::SRA) ? AShift : LShift;
27444}
27445
27446static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
27447 const X86Subtarget &Subtarget) {
27448 MVT VT = Op.getSimpleValueType();
27449 SDLoc dl(Op);
27450 SDValue R = Op.getOperand(0);
27451 SDValue Amt = Op.getOperand(1);
27452 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
27453
27454 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
27455 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
27456 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
27457 SDValue Ex = DAG.getBitcast(ExVT, R);
27458
27459 // ashr(R, 63) === cmp_slt(R, 0)
27460 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
27461 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
27462        "Unsupported PCMPGT op");
27463 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
27464 }
27465
27466 if (ShiftAmt >= 32) {
27467 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
27468 SDValue Upper =
27469 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
27470 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
27471 ShiftAmt - 32, DAG);
27472 if (VT == MVT::v2i64)
27473 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
27474 if (VT == MVT::v4i64)
27475 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
27476 {9, 1, 11, 3, 13, 5, 15, 7});
27477 } else {
27478 // SRA upper i32, SRL whole i64 and select lower i32.
27479 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
27480 ShiftAmt, DAG);
27481 SDValue Lower =
27482 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
27483 Lower = DAG.getBitcast(ExVT, Lower);
27484 if (VT == MVT::v2i64)
27485 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
27486 if (VT == MVT::v4i64)
27487 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
27488 {8, 1, 10, 3, 12, 5, 14, 7});
27489 }
27490 return DAG.getBitcast(VT, Ex);
27491 };
27492
27493 // Optimize shl/srl/sra with constant shift amount.
27494 APInt APIntShiftAmt;
27495 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
27496 return SDValue();
27497
27498 // If the shift amount is out of range, return undef.
27499 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
27500 return DAG.getUNDEF(VT);
27501
27502 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
27503
27504 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
27505 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
27506
27507 // i64 SRA needs to be performed as partial shifts.
27508 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
27509 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
27510 Op.getOpcode() == ISD::SRA)
27511 return ArithmeticShiftRight64(ShiftAmt);
27512
27513 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
27514 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
27515 unsigned NumElts = VT.getVectorNumElements();
27516 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27517
27518 // Simple i8 add case
27519 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
27520 return DAG.getNode(ISD::ADD, dl, VT, R, R);
27521
27522 // ashr(R, 7) === cmp_slt(R, 0)
27523 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
27524 SDValue Zeros = DAG.getConstant(0, dl, VT);
27525 if (VT.is512BitVector()) {
27526 assert(VT == MVT::v64i8 && "Unexpected element type!");
27527 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
27528 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
27529 }
27530 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
27531 }
27532
27533 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
27534 if (VT == MVT::v16i8 && Subtarget.hasXOP())
27535 return SDValue();
27536
27537 if (Op.getOpcode() == ISD::SHL) {
27538 // Make a large shift.
27539 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
27540 ShiftAmt, DAG);
27541 SHL = DAG.getBitcast(VT, SHL);
27542 // Zero out the rightmost bits.
27543 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
27544 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
27545 }
27546 if (Op.getOpcode() == ISD::SRL) {
27547 // Make a large shift.
27548 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
27549 ShiftAmt, DAG);
27550 SRL = DAG.getBitcast(VT, SRL);
27551 // Zero out the leftmost bits.
27552 return DAG.getNode(ISD::AND, dl, VT, SRL,
27553 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
27554 }
27555 if (Op.getOpcode() == ISD::SRA) {
27556 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
27557 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
27558
27559 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
27560 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
27561 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
27562 return Res;
27563 }
27564 llvm_unreachable("Unknown shift opcode.");
27565 }
27566
27567 return SDValue();
27568}
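// Editorial sketch (not part of X86ISelLowering.cpp): the vXi8 SRA case above
// uses ashr(x, s) == sub(xor(lshr(x, s), m), m) with m = 128 >> s, i.e. it
// sign-extends the (8 - s)-bit result of the logical shift. A scalar model
// (the helper name is illustrative only):
static inline uint8_t ashr8ViaLshr(uint8_t X, unsigned S) { // 0 <= S <= 7
  uint8_t Res = (uint8_t)(X >> S);       // logical shift right
  uint8_t Mask = (uint8_t)(0x80u >> S);  // position the sign bit landed in
  return (uint8_t)((uint8_t)(Res ^ Mask) - Mask); // same bits as (int8_t)X >> S
}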
27569
27570static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
27571 const X86Subtarget &Subtarget) {
27572 MVT VT = Op.getSimpleValueType();
27573 SDLoc dl(Op);
27574 SDValue R = Op.getOperand(0);
27575 SDValue Amt = Op.getOperand(1);
27576 unsigned Opcode = Op.getOpcode();
27577 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
27578 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
27579
27580 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
27581 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
27582 MVT EltVT = VT.getVectorElementType();
27583 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
27584 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
27585 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
27586 else if (EltVT.bitsLT(MVT::i32))
27587 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
27588
27589 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
27590 }
27591
27592 // vXi8 shifts - shift as v8i16 + mask result.
27593 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
27594 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
27595 VT == MVT::v64i8) &&
27596 !Subtarget.hasXOP()) {
27597 unsigned NumElts = VT.getVectorNumElements();
27598 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27599 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
27600 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
27601 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
27602 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
27603
27604 // Create the mask using vXi16 shifts. For shift-rights we need to move
27605 // the upper byte down before splatting the vXi8 mask.
27606 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
27607 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
27608 BaseShAmt, Subtarget, DAG);
27609 if (Opcode != ISD::SHL)
27610 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
27611 8, DAG);
27612 BitMask = DAG.getBitcast(VT, BitMask);
27613 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
27614 SmallVector<int, 64>(NumElts, 0));
27615
27616 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
27617 DAG.getBitcast(ExtVT, R), BaseShAmt,
27618 Subtarget, DAG);
27619 Res = DAG.getBitcast(VT, Res);
27620 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
27621
27622 if (Opcode == ISD::SRA) {
27623 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
27624 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
27625 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
27626 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
27627 BaseShAmt, Subtarget, DAG);
27628 SignMask = DAG.getBitcast(VT, SignMask);
27629 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
27630 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
27631 }
27632 return Res;
27633 }
27634 }
27635 }
27636
27637 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
27638 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
27639 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
27640 Amt = Amt.getOperand(0);
27641 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
27642 std::vector<SDValue> Vals(Ratio);
27643 for (unsigned i = 0; i != Ratio; ++i)
27644 Vals[i] = Amt.getOperand(i);
27645 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
27646 for (unsigned j = 0; j != Ratio; ++j)
27647 if (Vals[j] != Amt.getOperand(i + j))
27648 return SDValue();
27649 }
27650
27651 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
27652 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
27653 }
27654 return SDValue();
27655}
27656
27657// Convert a shift/rotate left amount to a multiplication scale factor.
27658static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
27659 const X86Subtarget &Subtarget,
27660 SelectionDAG &DAG) {
27661 MVT VT = Amt.getSimpleValueType();
27662 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
27663 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
27664 (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
27665 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
27666 return SDValue();
27667
27668 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
27669 SmallVector<SDValue, 8> Elts;
27670 MVT SVT = VT.getVectorElementType();
27671 unsigned SVTBits = SVT.getSizeInBits();
27672 APInt One(SVTBits, 1);
27673 unsigned NumElems = VT.getVectorNumElements();
27674
27675 for (unsigned i = 0; i != NumElems; ++i) {
27676 SDValue Op = Amt->getOperand(i);
27677 if (Op->isUndef()) {
27678 Elts.push_back(Op);
27679 continue;
27680 }
27681
27682 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
27683 APInt C(SVTBits, ND->getZExtValue());
27684 uint64_t ShAmt = C.getZExtValue();
27685 if (ShAmt >= SVTBits) {
27686 Elts.push_back(DAG.getUNDEF(SVT));
27687 continue;
27688 }
27689 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
27690 }
27691 return DAG.getBuildVector(VT, dl, Elts);
27692 }
27693
27694 // If the target doesn't support variable shifts, use either FP conversion
27695 // or integer multiplication to avoid shifting each element individually.
27696 if (VT == MVT::v4i32) {
27697 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
27698 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
27699 DAG.getConstant(0x3f800000U, dl, VT));
27700 Amt = DAG.getBitcast(MVT::v4f32, Amt);
27701 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
27702 }
27703
27704 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
27705 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
27706 SDValue Z = DAG.getConstant(0, dl, VT);
27707 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
27708 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
27709 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
27710 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
27711 if (Subtarget.hasSSE41())
27712 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
27713
27714 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
27715 DAG.getBitcast(VT, Hi),
27716 {0, 2, 4, 6, 8, 10, 12, 14});
27717 }
27718
27719 return SDValue();
27720}
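// Editorial sketch (not part of X86ISelLowering.cpp): the v4i32 branch above
// builds 2^Amt without a variable shift by writing Amt straight into the
// IEEE-754 exponent field (0x3f800000 is 1.0f) and converting back to integer.
// A scalar model, assuming <cstring> is available for std::memcpy (the helper
// name is illustrative only):
static inline int powerOfTwoViaFloatBits(unsigned Amt) { // valid for Amt <= 30
  unsigned Bits = (Amt << 23) + 0x3f800000U; // biased exponent 127 + Amt
  float F;
  std::memcpy(&F, &Bits, sizeof(F));         // bit-cast: F == 2^Amt
  return (int)F;                             // == 1 << Amt
}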
27721
27722static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
27723 SelectionDAG &DAG) {
27724 MVT VT = Op.getSimpleValueType();
27725 SDLoc dl(Op);
27726 SDValue R = Op.getOperand(0);
27727 SDValue Amt = Op.getOperand(1);
27728 unsigned EltSizeInBits = VT.getScalarSizeInBits();
27729 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
27730
27731 unsigned Opc = Op.getOpcode();
27732 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
27733 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
27734
27735 assert(VT.isVector() && "Custom lowering only for vector shifts!");
27736 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
27737
27738 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
27739 return V;
27740
27741 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
27742 return V;
27743
27744 if (SupportedVectorVarShift(VT, Subtarget, Opc))
27745 return Op;
27746
27747 // XOP has 128-bit variable logical/arithmetic shifts.
27748 // +ve/-ve Amt = shift left/right.
27749 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
27750 VT == MVT::v8i16 || VT == MVT::v16i8)) {
27751 if (Opc == ISD::SRL || Opc == ISD::SRA) {
27752 SDValue Zero = DAG.getConstant(0, dl, VT);
27753 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
27754 }
27755 if (Opc == ISD::SHL || Opc == ISD::SRL)
27756 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
27757 if (Opc == ISD::SRA)
27758 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
27759 }
27760
27761 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
27762 // shifts per-lane and then shuffle the partial results back together.
27763 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
27764 // Splat the shift amounts so the scalar shifts above will catch it.
27765 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
27766 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
27767 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
27768 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
27769 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
27770 }
27771
27772 // i64 vector arithmetic shift can be emulated with the transform:
27773 // M = lshr(SIGN_MASK, Amt)
27774 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
27775 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
27776 Opc == ISD::SRA) {
27777 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
27778 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
27779 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
27780 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
27781 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
27782 return R;
27783 }
27784
27785 // If possible, lower this shift as a sequence of two shifts by
27786 // constant plus a BLENDing shuffle instead of scalarizing it.
27787 // Example:
27788 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
27789 //
27790 // Could be rewritten as:
27791 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
27792 //
27793 // The advantage is that the two shifts from the example would be
27794 // lowered as X86ISD::VSRLI nodes in parallel before blending.
27795 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
27796 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
27797 SDValue Amt1, Amt2;
27798 unsigned NumElts = VT.getVectorNumElements();
27799 SmallVector<int, 8> ShuffleMask;
27800 for (unsigned i = 0; i != NumElts; ++i) {
27801 SDValue A = Amt->getOperand(i);
27802 if (A.isUndef()) {
27803 ShuffleMask.push_back(SM_SentinelUndef);
27804 continue;
27805 }
27806 if (!Amt1 || Amt1 == A) {
27807 ShuffleMask.push_back(i);
27808 Amt1 = A;
27809 continue;
27810 }
27811 if (!Amt2 || Amt2 == A) {
27812 ShuffleMask.push_back(i + NumElts);
27813 Amt2 = A;
27814 continue;
27815 }
27816 break;
27817 }
27818
27819 // Only perform this blend if we can perform it without loading a mask.
27820 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
27821 (VT != MVT::v16i16 ||
27822 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
27823 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
27824 canWidenShuffleElements(ShuffleMask))) {
27825 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
27826 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
27827 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
27828 Cst2->getAPIntValue().ult(EltSizeInBits)) {
27829 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
27830 Cst1->getZExtValue(), DAG);
27831 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
27832 Cst2->getZExtValue(), DAG);
27833 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
27834 }
27835 }
27836 }
27837
27838 // If possible, lower this packed shift into a vector multiply instead of
27839 // expanding it into a sequence of scalar shifts.
27840 if (Opc == ISD::SHL)
27841 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
27842 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
27843
27844 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
27845 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
27846 if (Opc == ISD::SRL && ConstantAmt &&
27847 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
27848 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
27849 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
27850 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
27851 SDValue Zero = DAG.getConstant(0, dl, VT);
27852 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
27853 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
27854 return DAG.getSelect(dl, VT, ZAmt, R, Res);
27855 }
27856 }
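// Editorial note on the MULHU replacement above: for a 16-bit x and
// 1 <= s <= 15, x >> s == mulhu(x, 2^(16 - s)), because
// x * 2^(16 - s) == (x >> s) * 2^16 + (x mod 2^s) * 2^(16 - s) and the second
// term is always below 2^16. Lanes with s == 0 are restored from R by the
// SETEQ/select, since their scale 2^16 does not fit in an i16 element.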
27857
27858 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
27859 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
27860 // TODO: Special case handling for shift by 0/1, really we can afford either
27861 // of these cases in pre-SSE41/XOP/AVX512 but not both.
27862 if (Opc == ISD::SRA && ConstantAmt &&
27863 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
27864 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
27865 !Subtarget.hasAVX512()) ||
27866 DAG.isKnownNeverZero(Amt))) {
27867 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
27868 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
27869 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
27870 SDValue Amt0 =
27871 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
27872 SDValue Amt1 =
27873 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
27874 SDValue Sra1 =
27875 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
27876 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
27877 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
27878 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
27879 }
27880 }
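// Editorial note on the MULHS form above: the same argument gives the
// arithmetic shift x >> s == mulhs(x, 2^(16 - s)) for 2 <= s <= 15. A shift
// by 0 would need the scale 2^16 and a shift by 1 would need 2^15, which is
// -32768 as a signed i16, so those lanes are patched via the Amt0/Amt1
// selects above.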
27881
27882 // v4i32 Non Uniform Shifts.
27883 // If the shift amount is constant we can shift each lane using the SSE2
27884 // immediate shifts, else we need to zero-extend each lane to the lower i64
27885 // and shift using the SSE2 variable shifts.
27886 // The separate results can then be blended together.
27887 if (VT == MVT::v4i32) {
27888 SDValue Amt0, Amt1, Amt2, Amt3;
27889 if (ConstantAmt) {
27890 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
27891 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
27892 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
27893 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
27894 } else {
27895 // The SSE2 shifts use the lower i64 as the same shift amount for
27896 // all lanes and the upper i64 is ignored. On AVX we're better off
27897 // just zero-extending, but for SSE just duplicating the top 16-bits is
27898 // cheaper and has the same effect for out of range values.
27899 if (Subtarget.hasAVX()) {
27900 SDValue Z = DAG.getConstant(0, dl, VT);
27901 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
27902 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
27903 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
27904 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
27905 } else {
27906 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
27907 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27908 {4, 5, 6, 7, -1, -1, -1, -1});
27909 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27910 {0, 1, 1, 1, -1, -1, -1, -1});
27911 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27912 {2, 3, 3, 3, -1, -1, -1, -1});
27913 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27914 {0, 1, 1, 1, -1, -1, -1, -1});
27915 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27916 {2, 3, 3, 3, -1, -1, -1, -1});
27917 }
27918 }
27919
27920 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
27921 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
27922 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
27923 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
27924 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
27925
27926 // Merge the shifted lane results optimally with/without PBLENDW.
27927 // TODO - ideally shuffle combining would handle this.
27928 if (Subtarget.hasSSE41()) {
27929 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
27930 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
27931 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
27932 }
27933 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
27934 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
27935 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
27936 }
27937
27938 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
27939 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
27940 // make the existing SSE solution better.
27941 // NOTE: We honor the preferred vector width before promoting to 512 bits.
27942 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
27943 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
27944 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
27945 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
27946 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
27947 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
27948        "Unexpected vector type");
27949 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
27950 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
27951 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27952 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
27953 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
27954 return DAG.getNode(ISD::TRUNCATE, dl, VT,
27955 DAG.getNode(Opc, dl, ExtVT, R, Amt));
27956 }
27957
27958 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
27959 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
27960 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
27961 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27962 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27963 !Subtarget.hasXOP()) {
27964 int NumElts = VT.getVectorNumElements();
27965 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
27966
27967 // Extend constant shift amount to vXi16 (it doesn't matter if the type
27968 // isn't legal).
27969 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27970 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
27971 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
27972 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
27973 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
27974        "Constant build vector expected");
27975
27976 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
27977 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
27978 : DAG.getZExtOrTrunc(R, dl, ExVT);
27979 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
27980 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
27981 return DAG.getZExtOrTrunc(R, dl, VT);
27982 }
27983
27984 SmallVector<SDValue, 16> LoAmt, HiAmt;
27985 for (int i = 0; i != NumElts; i += 16) {
27986 for (int j = 0; j != 8; ++j) {
27987 LoAmt.push_back(Amt.getOperand(i + j));
27988 HiAmt.push_back(Amt.getOperand(i + j + 8));
27989 }
27990 }
27991
27992 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
27993 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
27994 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
27995
27996 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
27997 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
27998 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
27999 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28000 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28001 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28002 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28003 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28004 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28005 }
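// Editorial note on the constant vXi8 path above: unpacking R with itself
// places each byte in both halves of an i16 lane, so the following shift by 8
// (arithmetic for SRA, logical for SRL, via X86OpcI) leaves a sign- or
// zero-extended copy of the byte in every lane. Multiplying by 2^(8 - Amt)
// and logically shifting right by 8 then performs the per-byte shift, and
// PACKUS reassembles the bytes.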
28006
28007 if (VT == MVT::v16i8 ||
28008 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28009 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28010 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28011
28012 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28013 if (VT.is512BitVector()) {
28014 // On AVX512BW targets we make use of the fact that VSELECT lowers
28015 // to a masked blend which selects bytes based just on the sign bit
28016 // extracted to a mask.
28017 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28018 V0 = DAG.getBitcast(VT, V0);
28019 V1 = DAG.getBitcast(VT, V1);
28020 Sel = DAG.getBitcast(VT, Sel);
28021 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28022 ISD::SETGT);
28023 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28024 } else if (Subtarget.hasSSE41()) {
28025 // On SSE41 targets we can use PBLENDVB which selects bytes based just
28026 // on the sign bit.
28027 V0 = DAG.getBitcast(VT, V0);
28028 V1 = DAG.getBitcast(VT, V1);
28029 Sel = DAG.getBitcast(VT, Sel);
28030 return DAG.getBitcast(SelVT,
28031 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28032 }
28033 // On pre-SSE41 targets we test for the sign bit by comparing to
28034 // zero - a negative value will set all bits of the lanes to true
28035 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28036 SDValue Z = DAG.getConstant(0, dl, SelVT);
28037 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28038 return DAG.getSelect(dl, SelVT, C, V0, V1);
28039 };
28040
28041 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28042 // We can safely do this using i16 shifts as we're only interested in
28043 // the 3 lower bits of each byte.
28044 Amt = DAG.getBitcast(ExtVT, Amt);
28045 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28046 Amt = DAG.getBitcast(VT, Amt);
28047
28048 if (Opc == ISD::SHL || Opc == ISD::SRL) {
28049 // r = VSELECT(r, shift(r, 4), a);
28050 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28051 R = SignBitSelect(VT, Amt, M, R);
28052
28053 // a += a
28054 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28055
28056 // r = VSELECT(r, shift(r, 2), a);
28057 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28058 R = SignBitSelect(VT, Amt, M, R);
28059
28060 // a += a
28061 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28062
28063 // return VSELECT(r, shift(r, 1), a);
28064 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28065 R = SignBitSelect(VT, Amt, M, R);
28066 return R;
28067 }
28068
28069 if (Opc == ISD::SRA) {
28070 // For SRA we need to unpack each byte to the higher byte of an i16 vector
28071 // so we can correctly sign extend. We don't care what happens to the
28072 // lower byte.
28073 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28074 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28075 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28076 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28077 ALo = DAG.getBitcast(ExtVT, ALo);
28078 AHi = DAG.getBitcast(ExtVT, AHi);
28079 RLo = DAG.getBitcast(ExtVT, RLo);
28080 RHi = DAG.getBitcast(ExtVT, RHi);
28081
28082 // r = VSELECT(r, shift(r, 4), a);
28083 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28084 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28085 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28086 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28087
28088 // a += a
28089 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28090 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28091
28092 // r = VSELECT(r, shift(r, 2), a);
28093 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28094 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28095 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28096 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28097
28098 // a += a
28099 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28100 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28101
28102 // r = VSELECT(r, shift(r, 1), a);
28103 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28104 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28105 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28106 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28107
28108 // Logical shift the result back to the lower byte, leaving a zero upper
28109 // byte meaning that we can safely pack with PACKUSWB.
28110 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28111 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28112 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28113 }
28114 }
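// Editorial note on the variable vXi8 shifts above: shifting the amount left
// by 5 moves its bit 2 into each byte's sign bit, so the first SignBitSelect
// chooses between the value shifted by 4 and the unshifted value; doubling
// the amount then exposes bit 1 (shift by 2) and finally bit 0 (shift by 1).
// Three rounds cover every shift amount in 0..7, which is all that matters
// for i8 shifts.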
28115
28116 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28117 MVT ExtVT = MVT::v8i32;
28118 SDValue Z = DAG.getConstant(0, dl, VT);
28119 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28120 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28121 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28122 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28123 ALo = DAG.getBitcast(ExtVT, ALo);
28124 AHi = DAG.getBitcast(ExtVT, AHi);
28125 RLo = DAG.getBitcast(ExtVT, RLo);
28126 RHi = DAG.getBitcast(ExtVT, RHi);
28127 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28128 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28129 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28130 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28131 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28132 }
28133
28134 if (VT == MVT::v8i16) {
28135 // If we have a constant shift amount, the non-SSE41 path is best as
28136 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
28137 bool UseSSE41 = Subtarget.hasSSE41() &&
28138 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28139
28140 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28141 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28142 // the sign bit.
28143 if (UseSSE41) {
28144 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28145 V0 = DAG.getBitcast(ExtVT, V0);
28146 V1 = DAG.getBitcast(ExtVT, V1);
28147 Sel = DAG.getBitcast(ExtVT, Sel);
28148 return DAG.getBitcast(
28149 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28150 }
28151 // On pre-SSE41 targets we splat the sign bit - a negative value will
28152 // set all bits of the lanes to true and VSELECT uses that in
28153 // its OR(AND(V0,C),AND(V1,~C)) lowering.
28154 SDValue C =
28155 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28156 return DAG.getSelect(dl, VT, C, V0, V1);
28157 };
28158
28159 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
28160 if (UseSSE41) {
28161 // On SSE41 targets we need to replicate the shift mask in both
28162 // bytes for PBLENDVB.
28163 Amt = DAG.getNode(
28164 ISD::OR, dl, VT,
28165 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28166 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28167 } else {
28168 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28169 }
28170
28171 // r = VSELECT(r, shift(r, 8), a);
28172 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28173 R = SignBitSelect(Amt, M, R);
28174
28175 // a += a
28176 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28177
28178 // r = VSELECT(r, shift(r, 4), a);
28179 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28180 R = SignBitSelect(Amt, M, R);
28181
28182 // a += a
28183 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28184
28185 // r = VSELECT(r, shift(r, 2), a);
28186 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28187 R = SignBitSelect(Amt, M, R);
28188
28189 // a += a
28190 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28191
28192 // return VSELECT(r, shift(r, 1), a);
28193 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28194 R = SignBitSelect(Amt, M, R);
28195 return R;
28196 }
28197
28198 // Decompose 256-bit shifts into 128-bit shifts.
28199 if (VT.is256BitVector())
28200 return splitVectorIntBinary(Op, DAG);
28201
28202 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28203 return splitVectorIntBinary(Op, DAG);
28204
28205 return SDValue();
28206}
28207
28208static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28209 SelectionDAG &DAG) {
28210 MVT VT = Op.getSimpleValueType();
28211 assert(VT.isVector() && "Custom lowering only for vector rotates!");
28212
28213 SDLoc DL(Op);
28214 SDValue R = Op.getOperand(0);
28215 SDValue Amt = Op.getOperand(1);
28216 unsigned Opcode = Op.getOpcode();
28217 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28218 int NumElts = VT.getVectorNumElements();
28219
28220 // Check for constant splat rotation amount.
28221 APInt CstSplatValue;
28222 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28223
28224 // Check for splat rotate by zero.
28225 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28226 return R;
28227
28228 // AVX512 implicitly uses modulo rotation amounts.
28229 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28230 // Attempt to rotate by immediate.
28231 if (IsCstSplat) {
28232 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28233 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28234 return DAG.getNode(RotOpc, DL, VT, R,
28235 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28236 }
28237
28238 // Else, fall-back on VPROLV/VPRORV.
28239 return Op;
28240 }
28241
28242 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28243 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28244 unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28245 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28246 }
28247
28248 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28249
28250 // XOP has 128-bit vector variable + immediate rotates.
28251 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28252 // XOP implicitly uses modulo rotation amounts.
28253 if (Subtarget.hasXOP()) {
28254 if (VT.is256BitVector())
28255 return splitVectorIntBinary(Op, DAG);
28256 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28257
28258 // Attempt to rotate by immediate.
28259 if (IsCstSplat) {
28260 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28261 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28262 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28263 }
28264
28265 // Use general rotate by variable (per-element).
28266 return Op;
28267 }
28268
28269 // Split 256-bit integers on pre-AVX2 targets.
28270 if (VT.is256BitVector() && !Subtarget.hasAVX2())
28271 return splitVectorIntBinary(Op, DAG);
28272
28273 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28274 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28275 VT == MVT::v32i16) &&
28276 Subtarget.hasAVX2())) &&
28277 "Only vXi32/vXi16/vXi8 vector rotates supported");
28278
28279 // Rotate by a uniform constant - expand back to shifts.
28280 if (IsCstSplat)
28281 return SDValue();
28282
28283 bool IsSplatAmt = DAG.isSplatValue(Amt);
28284
28285 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
28286 // the amount bit.
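// For illustration: a rotate amount of 5 (0b101) takes the rot4 and rot1 stages
// below while skipping rot2, giving the full rotate by 5.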
28287 if (EltSizeInBits == 8 && !IsSplatAmt) {
28288 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
28289 return SDValue();
28290
28291 // We don't need ModuloAmt here as we just peek at individual bits.
28292 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28293
28294 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28295 if (Subtarget.hasSSE41()) {
28296 // On SSE41 targets we can use PBLENDVB which selects bytes based just
28297 // on the sign bit.
28298 V0 = DAG.getBitcast(VT, V0);
28299 V1 = DAG.getBitcast(VT, V1);
28300 Sel = DAG.getBitcast(VT, Sel);
28301 return DAG.getBitcast(SelVT,
28302 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
28303 }
28304 // On pre-SSE41 targets we test for the sign bit by comparing to
28305 // zero - a negative value will set all bits of the lanes to true
28306 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28307 SDValue Z = DAG.getConstant(0, DL, SelVT);
28308 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
28309 return DAG.getSelect(DL, SelVT, C, V0, V1);
28310 };
28311
28312 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28313 // We can safely do this using i16 shifts as we're only interested in
28314 // the 3 lower bits of each byte.
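// After this shift, bit 2 of each byte's rotate amount sits in bit 7, the bit
// that BLENDV (or the PCMPGT-against-zero fallback) tests; each later 'a += a'
// exposes the next lower amount bit in that position.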
28315 Amt = DAG.getBitcast(ExtVT, Amt);
28316 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
28317 Amt = DAG.getBitcast(VT, Amt);
28318
28319 // r = VSELECT(r, rot(r, 4), a);
28320 SDValue M;
28321 M = DAG.getNode(
28322 ISD::OR, DL, VT,
28323 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
28324 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
28325 R = SignBitSelect(VT, Amt, M, R);
28326
28327 // a += a
28328 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28329
28330 // r = VSELECT(r, rot(r, 2), a);
28331 M = DAG.getNode(
28332 ISD::OR, DL, VT,
28333 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
28334 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
28335 R = SignBitSelect(VT, Amt, M, R);
28336
28337 // a += a
28338 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
28339
28340 // return VSELECT(r, rot(r, 1), a);
28341 M = DAG.getNode(
28342 ISD::OR, DL, VT,
28343 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
28344 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
28345 return SignBitSelect(VT, Amt, M, R);
28346 }
28347
28348 // ISD::ROT* uses modulo rotate amounts.
28349 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
28350 DAG.getConstant(EltSizeInBits - 1, DL, VT));
28351
28352 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28353 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
28354 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
28355
28356 // Fallback for splats + all supported variable shifts.
28357 // Fallback for non-constant AVX2 vXi16 as well.
28358 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
28359 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
28360 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
28361 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
28362 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
28363 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
28364 }
28365
28366 // As with shifts, convert the rotation amount to a multiplication factor.
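// Sketch of the idea (assuming convertShiftLeftToScale yields 2^amt per lane):
// for a w-bit lane, x * 2^k viewed as a 2w-bit product has (x << k) in its low
// w bits and (x >> (w - k)) in its high w bits, so OR-ing the two halves is
// exactly rotl(x, k). E.g. for w = 16, x = 0x1234, k = 4: 0x1234 * 16 = 0x12340,
// giving 0x2340 | 0x0001 = 0x2341 = rotl16(0x1234, 4).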
28367 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
28368 assert(Scale && "Failed to convert ROTL amount to scale");
28369
28370 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
28371 if (EltSizeInBits == 16) {
28372 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
28373 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
28374 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
28375 }
28376
28377 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
28378 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
28379 // that can then be OR'd with the lower 32-bits.
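// PMULUDQ only multiplies the even (0 and 2) 32-bit lanes of its operands, so
// the odd lanes are shuffled into even positions (R13/Scale13) and multiplied
// separately; the {0,4,2,6} shuffle below then gathers the low 32-bit halves,
// the {1,5,3,7} shuffle gathers the wrapped high halves, and the two are OR'd.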
28380 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
28381 static const int OddMask[] = {1, -1, 3, -1};
28382 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
28383 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
28384
28385 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28386 DAG.getBitcast(MVT::v2i64, R),
28387 DAG.getBitcast(MVT::v2i64, Scale));
28388 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
28389 DAG.getBitcast(MVT::v2i64, R13),
28390 DAG.getBitcast(MVT::v2i64, Scale13));
28391 Res02 = DAG.getBitcast(VT, Res02);
28392 Res13 = DAG.getBitcast(VT, Res13);
28393
28394 return DAG.getNode(ISD::OR, DL, VT,
28395 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
28396 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
28397}
28398
28399/// Returns true if the operand type is exactly twice the native width, and
28400/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
28401/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
28402/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
28403bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
28404 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
28405
28406 if (OpWidth == 64)
28407 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
28408 if (OpWidth == 128)
28409 return Subtarget.hasCmpxchg16b();
28410
28411 return false;
28412}
28413
28414bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
28415 Type *MemType = SI->getValueOperand()->getType();
28416
28417 bool NoImplicitFloatOps =
28418 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28419 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28420 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28421 (Subtarget.hasSSE1() || Subtarget.hasX87()))
28422 return false;
28423
28424 return needsCmpXchgNb(MemType);
28425}
28426
28427// Note: this turns large loads into lock cmpxchg8b/16b.
28428// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
28429TargetLowering::AtomicExpansionKind
28430X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
28431 Type *MemType = LI->getType();
28432
28433 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
28434 // can use movq to do the load. If we have X87 we can load into an 80-bit
28435 // X87 register and store it to a stack temporary.
28436 bool NoImplicitFloatOps =
28437 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
28438 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
28439 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
28440 (Subtarget.hasSSE1() || Subtarget.hasX87()))
28441 return AtomicExpansionKind::None;
28442
28443 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
28444 : AtomicExpansionKind::None;
28445}
28446
28447TargetLowering::AtomicExpansionKind
28448X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
28449 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
28450 Type *MemType = AI->getType();
28451
28452 // If the operand is too big, we must see if cmpxchg8/16b is available
28453 // and default to library calls otherwise.
28454 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
28455 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
28456 : AtomicExpansionKind::None;
28457 }
28458
28459 AtomicRMWInst::BinOp Op = AI->getOperation();
28460 switch (Op) {
28461 default:
28462 llvm_unreachable("Unknown atomic operation");
28463 case AtomicRMWInst::Xchg:
28464 case AtomicRMWInst::Add:
28465 case AtomicRMWInst::Sub:
28466 // It's better to use xadd, xsub or xchg for these in all cases.
28467 return AtomicExpansionKind::None;
28468 case AtomicRMWInst::Or:
28469 case AtomicRMWInst::And:
28470 case AtomicRMWInst::Xor:
28471 // If the atomicrmw's result isn't actually used, we can just add a "lock"
28472 // prefix to a normal instruction for these operations.
28473 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
28474 : AtomicExpansionKind::None;
28475 case AtomicRMWInst::Nand:
28476 case AtomicRMWInst::Max:
28477 case AtomicRMWInst::Min:
28478 case AtomicRMWInst::UMax:
28479 case AtomicRMWInst::UMin:
28480 case AtomicRMWInst::FAdd:
28481 case AtomicRMWInst::FSub:
28482 // These always require a non-trivial set of data operations on x86. We must
28483 // use a cmpxchg loop.
28484 return AtomicExpansionKind::CmpXChg;
28485 }
28486}
28487
28488LoadInst *
28489X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
28490 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
28491 Type *MemType = AI->getType();
28492 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
28493 // there is no benefit in turning such RMWs into loads, and it is actually
28494 // harmful as it introduces an mfence.
28495 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
28496 return nullptr;
28497
28498 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
28499 // lowering available in lowerAtomicArith.
28500 // TODO: push more cases through this path.
28501 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
28502 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
28503 AI->use_empty())
28504 return nullptr;
28505
28506 IRBuilder<> Builder(AI);
28507 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28508 auto SSID = AI->getSyncScopeID();
28509 // We must restrict the ordering to avoid generating loads with Release or
28510 // ReleaseAcquire orderings.
28511 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
28512
28513 // Before the load we need a fence. Here is an example lifted from
28514 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
28515 // is required:
28516 // Thread 0:
28517 // x.store(1, relaxed);
28518 // r1 = y.fetch_add(0, release);
28519 // Thread 1:
28520 // y.fetch_add(42, acquire);
28521 // r2 = x.load(relaxed);
28522 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
28523 // lowered to just a load without a fence. A mfence flushes the store buffer,
28524 // making the optimization clearly correct.
28525 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
28526 // clear that it is needed otherwise; we might be able to be more aggressive
28527 // on relaxed idempotent rmw. In practice, they do not look useful, so we
28528 // don't try to be especially clever.
28529 if (SSID == SyncScope::SingleThread)
28530 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
28531 // the IR level, so we must wrap it in an intrinsic.
28532 return nullptr;
28533
28534 if (!Subtarget.hasMFence())
28535 // FIXME: it might make sense to use a locked operation here but on a
28536 // different cache-line to prevent cache-line bouncing. In practice it
28537 // is probably a small win, and x86 processors without mfence are rare
28538 // enough that we do not bother.
28539 return nullptr;
28540
28541 Function *MFence =
28542 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
28543 Builder.CreateCall(MFence, {});
28544
28545 // Finally we can emit the atomic load.
28546 LoadInst *Loaded =
28547 Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
28548 Align(AI->getType()->getPrimitiveSizeInBits()));
28549 Loaded->setAtomic(Order, SSID);
28550 AI->replaceAllUsesWith(Loaded);
28551 AI->eraseFromParent();
28552 return Loaded;
28553}
28554
28555bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
28556 if (!SI.isUnordered())
28557 return false;
28558 return ExperimentalUnorderedISEL;
28559}
28560bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
28561 if (!LI.isUnordered())
28562 return false;
28563 return ExperimentalUnorderedISEL;
28564}
28565
28566
28567/// Emit a locked operation on a stack location which does not change any
28568/// memory location, but does involve a lock prefix. Location is chosen to be
28569/// a) very likely accessed only by a single thread to minimize cache traffic,
28570/// and b) definitely dereferenceable. Returns the new Chain result.
28571static SDValue emitLockedStackOp(SelectionDAG &DAG,
28572 const X86Subtarget &Subtarget, SDValue Chain,
28573 const SDLoc &DL) {
28574 // Implementation notes:
28575 // 1) LOCK prefix creates a full read/write reordering barrier for memory
28576 // operations issued by the current processor. As such, the location
28577 // referenced is not relevant for the ordering properties of the instruction.
28578 // See: Intel® 64 and IA-32 Architectures Software Developer's Manual,
28579 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
28580 // 2) Using an immediate operand appears to be the best encoding choice
28581 // here since it doesn't require an extra register.
28582 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
28583 // is small enough it might just be measurement noise.)
28584 // 4) When choosing offsets, there are several contributing factors:
28585 // a) If there's no redzone, we default to TOS. (We could allocate a cache
28586 // line aligned stack object to improve this case.)
28587 // b) To minimize our chances of introducing a false dependence, we prefer
28588 // to offset the stack usage from TOS slightly.
28589 // c) To minimize concerns about cross thread stack usage - in particular,
28590 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
28591 // captures state in the TOS frame and accesses it from many threads -
28592 // we want to use an offset such that the offset is in a distinct cache
28593 // line from the TOS frame.
28594 //
28595 // For a general discussion of the tradeoffs and benchmark results, see:
28596 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
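// As a rough sketch of what this emits: on x86-64 with a 128-byte red zone the
// node below becomes something like 'lock orl $0x0, -0x40(%rsp)'; without a
// red zone (or in 32-bit mode) the displacement is 0 and the base is ESP/RSP.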
28597
28598 auto &MF = DAG.getMachineFunction();
28599 auto &TFL = *Subtarget.getFrameLowering();
28600 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
28601
28602 if (Subtarget.is64Bit()) {
28603 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
28604 SDValue Ops[] = {
28605 DAG.getRegister(X86::RSP, MVT::i64), // Base
28606 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
28607 DAG.getRegister(0, MVT::i64), // Index
28608 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
28609 DAG.getRegister(0, MVT::i16), // Segment.
28610 Zero,
28611 Chain};
28612 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
28613 MVT::Other, Ops);
28614 return SDValue(Res, 1);
28615 }
28616
28617 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
28618 SDValue Ops[] = {
28619 DAG.getRegister(X86::ESP, MVT::i32), // Base
28620 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
28621 DAG.getRegister(0, MVT::i32), // Index
28622 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
28623 DAG.getRegister(0, MVT::i16), // Segment.
28624 Zero,
28625 Chain
28626 };
28627 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
28628 MVT::Other, Ops);
28629 return SDValue(Res, 1);
28630}
28631
28632static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
28633 SelectionDAG &DAG) {
28634 SDLoc dl(Op);
28635 AtomicOrdering FenceOrdering =
28636 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
28637 SyncScope::ID FenceSSID =
28638 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
28639
28640 // The only fence that needs an instruction is a sequentially-consistent
28641 // cross-thread fence.
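// (x86's TSO memory model already gives ordinary loads acquire and ordinary
// stores release semantics, so weaker fences only need to constrain the
// compiler, which is what the MEMBARRIER fallback below expresses.)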
28642 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
28643 FenceSSID == SyncScope::System) {
28644 if (Subtarget.hasMFence())
28645 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
28646
28647 SDValue Chain = Op.getOperand(0);
28648 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
28649 }
28650
28651 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
28652 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
28653}
28654
28655static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
28656 SelectionDAG &DAG) {
28657 MVT T = Op.getSimpleValueType();
28658 SDLoc DL(Op);
28659 unsigned Reg = 0;
28660 unsigned size = 0;
28661 switch(T.SimpleTy) {
28662 default: llvm_unreachable("Invalid value type!");
28663 case MVT::i8: Reg = X86::AL; size = 1; break;
28664 case MVT::i16: Reg = X86::AX; size = 2; break;
28665 case MVT::i32: Reg = X86::EAX; size = 4; break;
28666 case MVT::i64:
28667 assert(Subtarget.is64Bit() && "Node not type legal!");
28668 Reg = X86::RAX; size = 8;
28669 break;
28670 }
28671 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
28672 Op.getOperand(2), SDValue());
28673 SDValue Ops[] = { cpIn.getValue(0),
28674 Op.getOperand(1),
28675 Op.getOperand(3),
28676 DAG.getTargetConstant(size, DL, MVT::i8),
28677 cpIn.getValue(1) };
28678 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28679 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
28680 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
28681 Ops, T, MMO);
28682
28683 SDValue cpOut =
28684 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
28685 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
28686 MVT::i32, cpOut.getValue(2));
28687 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
28688
28689 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28690 cpOut, Success, EFLAGS.getValue(1));
28691}
28692
28693// Create MOVMSKB, taking into account whether we need to split for AVX1.
28694static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
28695 const X86Subtarget &Subtarget) {
28696 MVT InVT = V.getSimpleValueType();
28697
28698 if (InVT == MVT::v64i8) {
28699 SDValue Lo, Hi;
28700 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
28701 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
28702 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
28703 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
28704 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
28705 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
28706 DAG.getConstant(32, DL, MVT::i8));
28707 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
28708 }
28709 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
28710 SDValue Lo, Hi;
28711 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
28712 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
28713 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
28714 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
28715 DAG.getConstant(16, DL, MVT::i8));
28716 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
28717 }
28718
28719 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
28720}
28721
28722static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
28723 SelectionDAG &DAG) {
28724 SDValue Src = Op.getOperand(0);
28725 MVT SrcVT = Src.getSimpleValueType();
28726 MVT DstVT = Op.getSimpleValueType();
28727
28728 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
28729 // half to v32i1 and concatenating the result.
28730 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
28731 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
28732 assert(Subtarget.hasBWI() && "Expected BWI target");
28733 SDLoc dl(Op);
28734 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
28735 DAG.getIntPtrConstant(0, dl));
28736 Lo = DAG.getBitcast(MVT::v32i1, Lo);
28737 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
28738 DAG.getIntPtrConstant(1, dl));
28739 Hi = DAG.getBitcast(MVT::v32i1, Hi);
28740 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
28741 }
28742
28743 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
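// Sign-extending the vXi1 mask to vXi8 turns each lane into 0x00 or 0xFF, and
// PMOVMSKB then packs the per-byte sign bits into the low bits of a GPR,
// producing the scalar mask directly.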
28744 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
28745 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
28746 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
28747 SDLoc DL(Op);
28748 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
28749 V = getPMOVMSKB(DL, V, DAG, Subtarget);
28750 return DAG.getZExtOrTrunc(V, DL, DstVT);
28751 }
28752
28753 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
28754 SrcVT == MVT::i64) && "Unexpected VT!");
28755
28756 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
28757 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
28758 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
28759 // This conversion needs to be expanded.
28760 return SDValue();
28761
28762 SDLoc dl(Op);
28763 if (SrcVT.isVector()) {
28764 // Widen the input vector in the case of MVT::v2i32.
28765 // Example: from MVT::v2i32 to MVT::v4i32.
28766 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
28767 SrcVT.getVectorNumElements() * 2);
28768 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
28769 DAG.getUNDEF(SrcVT));
28770 } else {
28771 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
28772 "Unexpected source type in LowerBITCAST");
28773 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
28774 }
28775
28776 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
28777 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
28778
28779 if (DstVT == MVT::x86mmx)
28780 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
28781
28782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
28783 DAG.getIntPtrConstant(0, dl));
28784}
28785
28786/// Compute the horizontal sum of bytes in V for the elements of VT.
28787///
28788/// Requires V to be a byte vector and VT to be an integer vector type with
28789/// wider elements than V's type. The width of the elements of VT determines
28790/// how many bytes of V are summed horizontally to produce each element of the
28791/// result.
28792static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
28793 const X86Subtarget &Subtarget,
28794 SelectionDAG &DAG) {
28795 SDLoc DL(V);
28796 MVT ByteVecVT = V.getSimpleValueType();
28797 MVT EltVT = VT.getVectorElementType();
28798 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
28799 "Expected value to have byte element type.");
28800 assert(EltVT != MVT::i8 &&
28801 "Horizontal byte sum only makes sense for wider elements!");
28802 unsigned VecSize = VT.getSizeInBits();
28803 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
28804
28805 // The PSADBW instruction horizontally adds all bytes and leaves the result in
28806 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
28807 if (EltVT == MVT::i64) {
28808 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
28809 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
28810 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
28811 return DAG.getBitcast(VT, V);
28812 }
28813
28814 if (EltVT == MVT::i32) {
28815 // We unpack the low half and high half into i32s interleaved with zeros so
28816 // that we can use PSADBW to horizontally sum them. The most useful part of
28817 // this is that it lines up the results of two PSADBW instructions to be
28818 // two v2i64 vectors which concatenated are the 4 population counts. We can
28819 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
28820 SDValue Zeros = DAG.getConstant(0, DL, VT);
28821 SDValue V32 = DAG.getBitcast(VT, V);
28822 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
28823 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
28824
28825 // Do the horizontal sums into two v2i64s.
28826 Zeros = DAG.getConstant(0, DL, ByteVecVT);
28827 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
28828 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
28829 DAG.getBitcast(ByteVecVT, Low), Zeros);
28830 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
28831 DAG.getBitcast(ByteVecVT, High), Zeros);
28832
28833 // Merge them together.
28834 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
28835 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
28836 DAG.getBitcast(ShortVecVT, Low),
28837 DAG.getBitcast(ShortVecVT, High));
28838
28839 return DAG.getBitcast(VT, V);
28840 }
28841
28842 // The only element type left is i16.
28843 assert(EltVT == MVT::i16 && "Unknown how to handle type");
28844
28845 // To obtain the pop count for each i16 element starting from the pop count for
28846 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
28847 // right by 8. It is important to shift as i16s as i8 vector shift isn't
28848 // directly supported.
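// Sketch: if an i16 lane holds byte pop counts [cLo, cHi], the 16-bit SHL by 8
// copies cLo into the high byte, the byte-wise ADD turns that byte into
// cLo + cHi, and the 16-bit SRL by 8 moves the total into the low byte while
// clearing the high byte.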
28849 SDValue ShifterV = DAG.getConstant(8, DL, VT);
28850 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
28851 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
28852 DAG.getBitcast(ByteVecVT, V));
28853 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
28854}
28855
28856static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
28857 const X86Subtarget &Subtarget,
28858 SelectionDAG &DAG) {
28859 MVT VT = Op.getSimpleValueType();
28860 MVT EltVT = VT.getVectorElementType();
28861 int NumElts = VT.getVectorNumElements();
28862 (void)EltVT;
28863 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
28864
28865 // Implement a lookup table in register by using an algorithm based on:
28866 // http://wm.ite.pl/articles/sse-popcount.html
28867 //
28868 // The general idea is that each nibble of every byte in the input vector is an
28869 // index into an in-register pre-computed pop count table. We then split the
28870 // input vector into two new ones: (1) a vector with only the shifted-right
28871 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
28872 // masked out higher ones) for each byte. PSHUFB is used separately with both
28873 // to index the in-register table. Next, both are added and the result is an
28874 // i8 vector where each element contains the pop count for the input byte.
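// Worked example: for the input byte 0xB7 the high nibble 0xB and the low
// nibble 0x7 both index LUT entries of 3, and 3 + 3 = 6 = popcount(0xB7).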
28875 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
28876 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
28877 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
28878 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
28879
28880 SmallVector<SDValue, 64> LUTVec;
28881 for (int i = 0; i < NumElts; ++i)
28882 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28883 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
28884 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
28885
28886 // High nibbles
28887 SDValue FourV = DAG.getConstant(4, DL, VT);
28888 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
28889
28890 // Low nibbles
28891 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
28892
28893 // The input vector is used as the shuffle mask that index elements into the
28894 // LUT. After counting low and high nibbles, add the vector to obtain the
28895 // final pop count per i8 element.
28896 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
28897 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
28898 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
28899}
28900
28901// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
28902// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
28903static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28904 SelectionDAG &DAG) {
28905 MVT VT = Op.getSimpleValueType();
28906 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
28907 "Unknown CTPOP type to handle");
28908 SDLoc DL(Op.getNode());
28909 SDValue Op0 = Op.getOperand(0);
28910
28911 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
28912 if (Subtarget.hasVPOPCNTDQ()) {
28913 unsigned NumElems = VT.getVectorNumElements();
28914 assert((VT.getVectorElementType() == MVT::i8 ||
28915 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
28916 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
28917 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28918 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
28919 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
28920 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
28921 }
28922 }
28923
28924 // Decompose 256-bit ops into smaller 128-bit ops.
28925 if (VT.is256BitVector() && !Subtarget.hasInt256())
28926 return splitVectorIntUnary(Op, DAG);
28927
28928 // Decompose 512-bit ops into smaller 256-bit ops.
28929 if (VT.is512BitVector() && !Subtarget.hasBWI())
28930 return splitVectorIntUnary(Op, DAG);
28931
28932 // For element types greater than i8, do vXi8 pop counts and a bytesum.
28933 if (VT.getScalarType() != MVT::i8) {
28934 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
28935 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
28936 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
28937 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
28938 }
28939
28940 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
28941 if (!Subtarget.hasSSSE3())
28942 return SDValue();
28943
28944 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
28945}
28946
28947static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28948 SelectionDAG &DAG) {
28949 assert(Op.getSimpleValueType().isVector() &&
28950 "We only do custom lowering for vector population count.");
28951 return LowerVectorCTPOP(Op, Subtarget, DAG);
28952}
28953
28954static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
28955 MVT VT = Op.getSimpleValueType();
28956 SDValue In = Op.getOperand(0);
28957 SDLoc DL(Op);
28958
28959 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
28960 // perform the BITREVERSE.
28961 if (!VT.isVector()) {
28962 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
28963 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
28964 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
28965 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
28966 DAG.getIntPtrConstant(0, DL));
28967 }
28968
28969 int NumElts = VT.getVectorNumElements();
28970 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
28971
28972 // Decompose 256-bit ops into smaller 128-bit ops.
28973 if (VT.is256BitVector())
28974 return splitVectorIntUnary(Op, DAG);
28975
28976 assert(VT.is128BitVector() &&
28977 "Only 128-bit vector bitreverse lowering supported.");
28978
28979 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
28980 // perform the BSWAP in the shuffle.
28981 // It's best to shuffle using the second operand as this will implicitly allow
28982 // memory folding for multiple vectors.
28983 SmallVector<SDValue, 16> MaskElts;
28984 for (int i = 0; i != NumElts; ++i) {
28985 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
28986 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
28987 int PermuteByte = SourceByte | (2 << 5);
28988 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
28989 }
28990 }
28991
28992 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
28993 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
28994 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
28995 Res, Mask);
28996 return DAG.getBitcast(VT, Res);
28997}
28998
28999static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29000 SelectionDAG &DAG) {
29001 MVT VT = Op.getSimpleValueType();
29002
29003 if (Subtarget.hasXOP() && !VT.is512BitVector())
29004 return LowerBITREVERSE_XOP(Op, DAG);
29005
29006 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29007
29008 SDValue In = Op.getOperand(0);
29009 SDLoc DL(Op);
29010
29011 assert(VT.getScalarType() == MVT::i8 &&
29012 "Only byte vector BITREVERSE supported");
29013
29014 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29015 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29016 return splitVectorIntUnary(Op, DAG);
29017
29018 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29019 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29020 return splitVectorIntUnary(Op, DAG);
29021
29022 unsigned NumElts = VT.getVectorNumElements();
29023
29024 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
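// (The 64-bit constant below is, per qword, the 8x8 GF(2) matrix whose affine
// transform with a zero addend maps every byte to its bit-reversed value.)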
29025 if (Subtarget.hasGFNI()) {
29026 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29027 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29028 Matrix = DAG.getBitcast(VT, Matrix);
29029 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29030 DAG.getTargetConstant(0, DL, MVT::i8));
29031 }
29032
29033 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
29034 // two nibbles and a PSHUFB lookup to find the bitreverse of each
29035 // 0-15 value (moved to the other nibble).
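// Worked example: for the input byte 0x1E, LoLUT[0xE] = 0x70 and
// HiLUT[0x1] = 0x08, and 0x70 | 0x08 = 0x78, which is 0x1E bit-reversed.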
29036 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29037 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29038 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29039
29040 const int LoLUT[16] = {
29041 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29042 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29043 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29044 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29045 const int HiLUT[16] = {
29046 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29047 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29048 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29049 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29050
29051 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29052 for (unsigned i = 0; i < NumElts; ++i) {
29053 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29054 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29055 }
29056
29057 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29058 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29059 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29060 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29061 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29062}
29063
29064static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29065 SelectionDAG &DAG) {
29066 SDLoc DL(Op);
29067 SDValue X = Op.getOperand(0);
29068 MVT VT = Op.getSimpleValueType();
29069
29070 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29071 if (VT == MVT::i8 ||
29072 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29073 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29074 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29075 DAG.getConstant(0, DL, MVT::i8));
29076 // Copy the inverse of the parity flag into a register with setcc.
29077 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29078 // Extend to the original type.
29079 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29080 }
29081
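// General strategy for wider types: the parity of a value equals the parity of
// the XOR of its two halves, so fold 64 -> 32 -> 16 -> 8 bits with XORs; the
// x86 parity flag only reflects the low 8 bits of a result, so once a single
// byte remains the flag can be read directly with SETNP.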
29082 if (VT == MVT::i64) {
29083 // Xor the high and low 32-bit halves together using a 32-bit operation.
29084 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29085 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29086 DAG.getConstant(32, DL, MVT::i8)));
29087 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29088 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29089 }
29090
29091 if (VT != MVT::i16) {
29092 // Xor the high and low 16-bits together using a 32-bit operation.
29093 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29094 DAG.getConstant(16, DL, MVT::i8));
29095 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29096 } else {
29097 // If the input is 16-bits, we need to extend to use an i32 shift below.
29098 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29099 }
29100
29101 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29102 // This should allow an h-reg to be used to save a shift.
29103 SDValue Hi = DAG.getNode(
29104 ISD::TRUNCATE, DL, MVT::i8,
29105 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29106 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29107 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29108 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29109
29110 // Copy the inverse of the parity flag into a register with setcc.
29111 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29112 // Extend to the original type.
29113 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29114}
29115
29116static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29117 const X86Subtarget &Subtarget) {
29118 unsigned NewOpc = 0;
29119 switch (N->getOpcode()) {
29120 case ISD::ATOMIC_LOAD_ADD:
29121 NewOpc = X86ISD::LADD;
29122 break;
29123 case ISD::ATOMIC_LOAD_SUB:
29124 NewOpc = X86ISD::LSUB;
29125 break;
29126 case ISD::ATOMIC_LOAD_OR:
29127 NewOpc = X86ISD::LOR;
29128 break;
29129 case ISD::ATOMIC_LOAD_XOR:
29130 NewOpc = X86ISD::LXOR;
29131 break;
29132 case ISD::ATOMIC_LOAD_AND:
29133 NewOpc = X86ISD::LAND;
29134 break;
29135 default:
29136 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29137 }
29138
29139 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29140
29141 return DAG.getMemIntrinsicNode(
29142 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29143 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29144 /*MemVT=*/N->getSimpleValueType(0), MMO);
29145}
29146
29147/// Lower atomic_load_ops into LOCK-prefixed operations.
29148static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29149 const X86Subtarget &Subtarget) {
29150 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29151 SDValue Chain = N->getOperand(0);
29152 SDValue LHS = N->getOperand(1);
29153 SDValue RHS = N->getOperand(2);
29154 unsigned Opc = N->getOpcode();
29155 MVT VT = N->getSimpleValueType(0);
29156 SDLoc DL(N);
29157
29158 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29159 // can only be lowered when the result is unused. They should have already
29160 // been transformed into a cmpxchg loop in AtomicExpand.
29161 if (N->hasAnyUseOfValue(0)) {
29162 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29163 // select LXADD if LOCK_SUB can't be selected.
29164 if (Opc == ISD::ATOMIC_LOAD_SUB) {
29165 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29166 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29167 RHS, AN->getMemOperand());
29168 }
29169 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29170 "Used AtomicRMW ops other than Add should have been expanded!");
29171 return N;
29172 }
29173
29174 // Specialized lowering for the canonical form of an idempotent atomicrmw.
29175 // The core idea here is that since the memory location isn't actually
29176 // changing, all we need is a lowering for the *ordering* impacts of the
29177 // atomicrmw. As such, we can choose a different operation and memory
29178 // location to minimize impact on other code.
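// For example, an 'atomicrmw or %p, 0 seq_cst' whose result is unused only
// needs its ordering effect, so it is lowered to the locked stack operation
// (or to a plain MEMBARRIER for weaker orderings) without touching %p at all.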
29179 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29180 // On X86, the only ordering which actually requires an instruction is a
29181 // seq_cst that isn't SingleThread; everything else just needs to be preserved
29182 // during codegen and then dropped. Note that we expect (but don't assume)
29183 // that orderings other than seq_cst and acq_rel have been canonicalized to
29184 // a store or load.
29185 if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
29186 AN->getSyncScopeID() == SyncScope::System) {
29187 // Prefer a locked operation against a stack location to minimize cache
29188 // traffic. This assumes that stack locations are very likely to be
29189 // accessed only by the owning thread.
29190 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29191      assert(!N->hasAnyUseOfValue(0));
29192 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29193 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29194 DAG.getUNDEF(VT), NewChain);
29195 }
29196 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29197 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29198    assert(!N->hasAnyUseOfValue(0));
29199 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29200 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29201 DAG.getUNDEF(VT), NewChain);
29202 }
29203
29204 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29205 // RAUW the chain, but don't worry about the result, as it's unused.
29206  assert(!N->hasAnyUseOfValue(0));
29207 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29208 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29209 DAG.getUNDEF(VT), LockOp.getValue(1));
29210}
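A minimal illustration (not part of this file; plain ISO C++ and std::atomic, so nothing below is an LLVM API) of the source-level idiom the idempotent-atomicrmw path above targets: an RMW that leaves memory unchanged and is used only for its seq_cst ordering, which this lowering may replace with a cheap LOCK-prefixed operation on the thread's own stack.

  #include <atomic>

  // atomicrmw or %f, 0 seq_cst -- the value never changes, only the ordering
  // matters, so the lowering above can substitute a locked stack operation.
  void ordering_only_fence(std::atomic<unsigned> &f) {
    f.fetch_or(0u, std::memory_order_seq_cst);
  }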
29211
29212static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29213 const X86Subtarget &Subtarget) {
29214 auto *Node = cast<AtomicSDNode>(Op.getNode());
29215 SDLoc dl(Node);
29216 EVT VT = Node->getMemoryVT();
29217
29218 bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
29219 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29220
29221 // If this store is not sequentially consistent and the type is legal
29222 // we can just keep it.
29223 if (!IsSeqCst && IsTypeLegal)
29224 return Op;
29225
29226 if (VT == MVT::i64 && !IsTypeLegal) {
29227 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29228 // is enabled.
29229 bool NoImplicitFloatOps =
29230 DAG.getMachineFunction().getFunction().hasFnAttribute(
29231 Attribute::NoImplicitFloat);
29232 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29233 SDValue Chain;
29234 if (Subtarget.hasSSE1()) {
29235 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29236 Node->getOperand(2));
29237 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29238 SclToVec = DAG.getBitcast(StVT, SclToVec);
29239 SDVTList Tys = DAG.getVTList(MVT::Other);
29240 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29241 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29242 MVT::i64, Node->getMemOperand());
29243 } else if (Subtarget.hasX87()) {
29244 // First load this into an 80-bit X87 register using a stack temporary.
29245 // This will put the whole integer into the significand.
29246 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29247 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29248 MachinePointerInfo MPI =
29249 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29250 Chain =
29251 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29252 MPI, MaybeAlign(), MachineMemOperand::MOStore);
29253 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29254 SDValue LdOps[] = {Chain, StackPtr};
29255 SDValue Value =
29256 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29257 /*Align*/ None, MachineMemOperand::MOLoad);
29258 Chain = Value.getValue(1);
29259
29260 // Now use an FIST to do the atomic store.
29261 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29262 Chain =
29263 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29264 StoreOps, MVT::i64, Node->getMemOperand());
29265 }
29266
29267 if (Chain) {
29268 // If this is a sequentially consistent store, also emit an appropriate
29269 // barrier.
29270 if (IsSeqCst)
29271 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
29272
29273 return Chain;
29274 }
29275 }
29276 }
29277
29278 // Convert seq_cst store -> xchg
29279 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
29280 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
29281 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
29282 Node->getMemoryVT(),
29283 Node->getOperand(0),
29284 Node->getOperand(1), Node->getOperand(2),
29285 Node->getMemOperand());
29286 return Swap.getValue(1);
29287}
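A brief sketch (illustrative only, standard C++ rather than anything from this file) of the two store shapes LowerATOMIC_STORE distinguishes: a non-seq_cst store of a legal type is kept as an ordinary atomic store, while a seq_cst store is rewritten into an ATOMIC_SWAP whose result is ignored, i.e. an exchange.

  #include <atomic>

  void release_store(std::atomic<long> &x) {
    x.store(1, std::memory_order_release);  // legal type, not seq_cst: kept as-is
  }
  void seq_cst_store(std::atomic<long> &x) {
    x.store(1, std::memory_order_seq_cst);  // converted to a swap by the path above
  }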
29288
29289static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
29290 SDNode *N = Op.getNode();
29291 MVT VT = N->getSimpleValueType(0);
29292 unsigned Opc = Op.getOpcode();
29293
29294 // Let legalize expand this if it isn't a legal type yet.
29295 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29296 return SDValue();
29297
29298 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29299 SDLoc DL(N);
29300
29301 // Set the carry flag.
29302 SDValue Carry = Op.getOperand(2);
29303 EVT CarryVT = Carry.getValueType();
29304 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
29305 Carry, DAG.getAllOnesConstant(DL, CarryVT));
29306
29307 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
29308 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
29309 Op.getOperand(0), Op.getOperand(1),
29310 Carry.getValue(1));
29311
29312 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
29313 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
29314 Sum.getValue(1), DL, DAG);
29315 if (N->getValueType(1) == MVT::i1)
29316 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
29317
29318 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
29319}
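A small sketch (plain C++, not part of this lowering) of the multi-word addition pattern that ADDCARRY/SUBCARRY lowering serves: the carry out of the low word is consumed by the high word, which is exactly what the X86ISD::ADC node above models.

  #include <cstdint>

  struct U128 { uint64_t lo, hi; };

  U128 add128(U128 a, U128 b) {
    U128 r;
    r.lo = a.lo + b.lo;
    uint64_t carry = r.lo < a.lo ? 1 : 0;  // carry out of the low 64 bits
    r.hi = a.hi + b.hi + carry;            // maps to adc on x86-64
    return r;
  }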
29320
29321static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
29322 SelectionDAG &DAG) {
29323  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
29324
29325 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
29326 // which returns the values as { float, float } (in XMM0) or
29327 // { double, double } (which is returned in XMM0, XMM1).
29328 SDLoc dl(Op);
29329 SDValue Arg = Op.getOperand(0);
29330 EVT ArgVT = Arg.getValueType();
29331 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29332
29333 TargetLowering::ArgListTy Args;
29334 TargetLowering::ArgListEntry Entry;
29335
29336 Entry.Node = Arg;
29337 Entry.Ty = ArgTy;
29338 Entry.IsSExt = false;
29339 Entry.IsZExt = false;
29340 Args.push_back(Entry);
29341
29342 bool isF64 = ArgVT == MVT::f64;
29343 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
29344 // the small struct {f32, f32} is returned in (eax, edx). For f64,
29345 // the results are returned via SRet in memory.
29346 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29347 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
29348 const char *LibcallName = TLI.getLibcallName(LC);
29349 SDValue Callee =
29350 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
29351
29352 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
29353 : (Type *)FixedVectorType::get(ArgTy, 4);
29354
29355 TargetLowering::CallLoweringInfo CLI(DAG);
29356 CLI.setDebugLoc(dl)
29357 .setChain(DAG.getEntryNode())
29358 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
29359
29360 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
29361
29362 if (isF64)
29363 // Returned in xmm0 and xmm1.
29364 return CallResult.first;
29365
29366  // Returned in bits 0:31 and 32:63 of xmm0.
29367 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29368 CallResult.first, DAG.getIntPtrConstant(0, dl));
29369 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
29370 CallResult.first, DAG.getIntPtrConstant(1, dl));
29371 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
29372 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
29373}
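For orientation (illustrative declarations only; the struct names are made up, the register assignments come from the comments above): __sincos_stret returns both results in registers, which is why the f32 case extracts lanes 0 and 1 of xmm0 while the f64 case can use the call result directly.

  struct SinCosF64 { double sin_val, cos_val; };  // returned in xmm0 / xmm1
  struct SinCosF32 { float  sin_val, cos_val; };  // packed into xmm0, bits 0:31 and 32:63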
29374
29375/// Widen a vector input to a vector of NVT. The
29376/// input vector must have the same element type as NVT.
29377static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
29378 bool FillWithZeroes = false) {
29379 // Check if InOp already has the right width.
29380 MVT InVT = InOp.getSimpleValueType();
29381 if (InVT == NVT)
29382 return InOp;
29383
29384 if (InOp.isUndef())
29385 return DAG.getUNDEF(NVT);
29386
29387  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
29388         "input and widen element type must match");
29389
29390 unsigned InNumElts = InVT.getVectorNumElements();
29391 unsigned WidenNumElts = NVT.getVectorNumElements();
29392  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
29393         "Unexpected request for vector widening");
29394
29395 SDLoc dl(InOp);
29396 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
29397 InOp.getNumOperands() == 2) {
29398 SDValue N1 = InOp.getOperand(1);
29399 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
29400 N1.isUndef()) {
29401 InOp = InOp.getOperand(0);
29402 InVT = InOp.getSimpleValueType();
29403 InNumElts = InVT.getVectorNumElements();
29404 }
29405 }
29406 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
29407 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
29408 SmallVector<SDValue, 16> Ops;
29409 for (unsigned i = 0; i < InNumElts; ++i)
29410 Ops.push_back(InOp.getOperand(i));
29411
29412 EVT EltVT = InOp.getOperand(0).getValueType();
29413
29414 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
29415 DAG.getUNDEF(EltVT);
29416 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
29417 Ops.push_back(FillVal);
29418 return DAG.getBuildVector(NVT, dl, Ops);
29419 }
29420 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
29421 DAG.getUNDEF(NVT);
29422 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
29423 InOp, DAG.getIntPtrConstant(0, dl));
29424}
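A minimal sketch (an assumed helper in plain C++, not part of X86ISelLowering.cpp) of the widening contract ExtendToType enforces: the element type is preserved, the new length is a multiple of the old one, and the tail is filled with zeroes or a chosen "don't care" value.

  #include <array>
  #include <cstddef>

  template <typename T, std::size_t InN, std::size_t WideN>
  std::array<T, WideN> widen(const std::array<T, InN> &in, T fill = T{}) {
    static_assert(WideN > InN && WideN % InN == 0,
                  "Unexpected request for vector widening");
    std::array<T, WideN> out;
    out.fill(fill);                        // zero-fill or an "undef" stand-in
    for (std::size_t i = 0; i < InN; ++i)  // keep the low InN elements
      out[i] = in[i];
    return out;
  }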
29425
29426static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
29427 SelectionDAG &DAG) {
29428  assert(Subtarget.hasAVX512() &&
29429         "MGATHER/MSCATTER are supported on AVX-512 arch only");
29430
29431 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
29432 SDValue Src = N->getValue();
29433 MVT VT = Src.getSimpleValueType();
29434  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
29435 SDLoc dl(Op);
29436
29437 SDValue Scale = N->getScale();
29438 SDValue Index = N->getIndex();
29439 SDValue Mask = N->getMask();
29440 SDValue Chain = N->getChain();
29441 SDValue BasePtr = N->getBasePtr();
29442
29443 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
29444    assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
29445 // If the index is v2i64 and we have VLX we can use xmm for data and index.
29446 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
29447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29448 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
29449 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
29450 SDVTList VTs = DAG.getVTList(MVT::Other);
29451 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
29452 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
29453 N->getMemoryVT(), N->getMemOperand());
29454 }
29455 return SDValue();
29456 }
29457
29458 MVT IndexVT = Index.getSimpleValueType();
29459
29460 // If the index is v2i32, we're being called by type legalization and we
29461 // should just let the default handling take care of it.
29462 if (IndexVT == MVT::v2i32)
29463 return SDValue();
29464
29465  // If we don't have VLX and neither the passthru nor the index is 512 bits,
29466  // we need to widen until one is.
29467 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
29468 !Index.getSimpleValueType().is512BitVector()) {
29469 // Determine how much we need to widen by to get a 512-bit type.
29470 unsigned Factor = std::min(512/VT.getSizeInBits(),
29471 512/IndexVT.getSizeInBits());
29472 unsigned NumElts = VT.getVectorNumElements() * Factor;
29473
29474 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
29475 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
29476 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
29477
29478 Src = ExtendToType(Src, VT, DAG);
29479 Index = ExtendToType(Index, IndexVT, DAG);
29480 Mask = ExtendToType(Mask, MaskVT, DAG, true);
29481 }
29482
29483 SDVTList VTs = DAG.getVTList(MVT::Other);
29484 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
29485 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
29486 N->getMemoryVT(), N->getMemOperand());
29487}
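A worked example of the widening arithmetic above (numbers only, assuming no VLX): for a v4i32 source with a v4i64 index, Factor = min(512/128, 512/256) = 2, so NumElts = 4 * 2 = 8 and the scatter is issued as v8i32 data with a v8i64 (512-bit) index and a v8i1 mask; the extra mask lanes are filled with zeroes so the padded elements are never stored.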
29488
29489static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
29490 SelectionDAG &DAG) {
29491
29492 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
29493 MVT VT = Op.getSimpleValueType();
29494 MVT ScalarVT = VT.getScalarType();
29495 SDValue Mask = N->getMask();
29496 MVT MaskVT = Mask.getSimpleValueType();
29497 SDValue PassThru = N->getPassThru();
29498 SDLoc dl(Op);
29499
29500 // Handle AVX masked loads which don't support passthru other than 0.
29501 if (MaskVT.getVectorElementType() != MVT::i1) {
29502 // We also allow undef in the isel pattern.
29503 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
29504 return Op;
29505
29506 SDValue NewLoad = DAG.getMaskedLoad(
29507 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
29508 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
29509 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
29510 N->isExpandingLoad());
29511 // Emit a blend.
29512 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
29513 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
29514 }
29515
29516  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
29517         "Expanding masked load is supported on AVX-512 target only!");
29518
29519  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
29520         "Expanding masked load is supported for 32 and 64-bit types only!");
29521
29522  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
29523         "Cannot lower masked load op.");
29524
29525  assert((ScalarVT.getSizeInBits() >= 32 ||
29526          (Subtarget.hasBWI() &&
29527              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
29528         "Unsupported masked load op.");
29529
29530  // This operation is legal for targets with VLX, but without
29531  // VLX the vector should be widened to 512 bits.
29532 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
29533 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
29534 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
29535
29536 // Mask element has to be i1.
29537  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
29538         "Unexpected mask type");
29539
29540 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
29541
29542 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
29543 SDValue NewLoad = DAG.getMaskedLoad(
29544 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
29545 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
29546 N->getExtensionType(), N->isExpandingLoad());
29547
29548 SDValue Extract =
29549 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
29550 DAG.getIntPtrConstant(0, dl));
29551 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
29552 return DAG.getMergeValues(RetOps, dl);
29553}
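A worked example of the widening above (numbers only, AVX-512 without VLX): a masked v8i32 load has NumEltsInWideVec = 512 / 32 = 16, so the load is issued as v16i32 with a v16i1 mask whose upper 8 lanes are zero, and the original v8i32 result is recovered by the EXTRACT_SUBVECTOR at index 0.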
29554
29555static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
29556 SelectionDAG &DAG) {
29557 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
29558 SDValue DataToStore = N->getValue();
29559 MVT VT = DataToStore.getSimpleValueType();
29560 MVT ScalarVT = VT.getScalarType();
29561 SDValue Mask = N->getMask();
29562 SDLoc dl(Op);
29563
29564  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
29565         "Expanding masked load is supported on AVX-512 target only!");
29566
29567  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
29568         "Expanding masked load is supported for 32 and 64-bit types only!");
29569
29570  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
29571         "Cannot lower masked store op.");
29572
29573  assert((ScalarVT.getSizeInBits() >= 32 ||
29574          (Subtarget.hasBWI() &&
29575              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
29576         "Unsupported masked store op.");
29577
29578  // This operation is legal for targets with VLX, but without
29579  // VLX the vector should be widened to 512 bits.
29580 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
29581 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
29582
29583 // Mask element has to be i1.
29584  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
29585         "Unexpected mask type");
29586
29587 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
29588
29589 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
29590 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
29591 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
29592 N->getOffset(), Mask, N->getMemoryVT(),
29593 N->getMemOperand(), N->getAddressingMode(),
29594 N->isTruncatingStore(), N->isCompressingStore());
29595}
29596
29597static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
29598 SelectionDAG &DAG) {
29599  assert(Subtarget.hasAVX2() &&
29600         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
29601
29602 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
29603 SDLoc dl(Op);
29604 MVT VT = Op.getSimpleValueType();
29605 SDValue Index = N->getIndex();
29606 SDValue Mask = N->getMask();
29607 SDValue PassThru = N->getPassThru();
29608 MVT IndexVT = Index.getSimpleValueType();
29609
29610  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
29611
29612 // If the index is v2i32, we're being called by type legalization.
29613 if (IndexVT == MVT::v2i32)
29614 return SDValue();
29615
29616  // If we don't have VLX and neither the passthru nor the index is 512 bits,
29617  // we need to widen until one is.
29618 MVT OrigVT = VT;
29619 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
29620 !IndexVT.is512BitVector()) {
29621 // Determine how much we need to widen by to get a 512-bit type.
29622 unsigned Factor = std::min(512/VT.getSizeInBits(),
29623 512/IndexVT.getSizeInBits());
29624
29625 unsigned NumElts = VT.getVectorNumElements() * Factor;
29626
29627 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
29628 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
29629 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
29630
29631 PassThru = ExtendToType(PassThru, VT, DAG);
29632 Index = ExtendToType(Index, IndexVT, DAG);
29633 Mask = ExtendToType(Mask, MaskVT, DAG, true);
29634 }
29635
29636 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
29637 N->getScale() };
29638 SDValue NewGather = DAG.getMemIntrinsicNode(
29639 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
29640 N->getMemOperand());
29641 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
29642 NewGather, DAG.getIntPtrConstant(0, dl));
29643 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
29644}
29645
29646static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
29647 SDLoc dl(Op);
29648 SDValue Src = Op.getOperand(0);
29649 MVT DstVT = Op.getSimpleValueType();
29650
29651 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
29652 unsigned SrcAS = N->getSrcAddressSpace();
29653
29654  assert(SrcAS != N->getDestAddressSpace() &&
29655         "addrspacecast must be between different address spaces");
29656
29657 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
29658 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
29659 } else if (DstVT == MVT::i64) {
29660 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
29661 } else if (DstVT == MVT::i32) {
29662 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
29663 } else {
29664 report_fatal_error("Bad address space in addrspacecast");
29665 }
29666 return Op;
29667}
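A tiny sketch (plain C++, illustrative only) of the integer conversions this addrspacecast lowering reduces to: a 32-bit unsigned pointer representation is zero-extended to 64 bits, a signed one is sign-extended, and a 64-bit value cast into a 32-bit pointer space is truncated.

  #include <cstdint>

  uint64_t widen_uptr(uint32_t p32) { return static_cast<uint64_t>(p32); } // zero-extend
  int64_t  widen_sptr(int32_t  p32) { return static_cast<int64_t>(p32);  } // sign-extend
  uint32_t narrow_ptr(uint64_t p64) { return static_cast<uint32_t>(p64); } // truncate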
29668
29669SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
29670 SelectionDAG &DAG) const {
29671 // TODO: Eventually, the lowering of these nodes should be informed by or
29672 // deferred to the GC strategy for the function in which they appear. For
29673 // now, however, they must be lowered to something. Since they are logically
29674 // no-ops in the case of a null GC strategy (or a GC strategy which does not
29675 // require special handling for these nodes), lower them as literal NOOPs for
29676 // the time being.
29677 SmallVector<SDValue, 2> Ops;
29678
29679 Ops.push_back(Op.getOperand(0));
29680 if (Op->getGluedNode())
29681 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
29682
29683 SDLoc OpDL(Op);
29684 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
29685 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
29686
29687 return NOOP;
29688}
29689
29690SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
29691 RTLIB::Libcall Call) const {
29692
29693 bool IsStrict = Op->isStrictFPOpcode();
29694 unsigned Offset = IsStrict ? 1 : 0;
29695 SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
29696
29697 SDLoc dl(Op);
29698 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
29699 MakeLibCallOptions CallOptions;
29700 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
29701 CallOptions, dl, Chain);
29702
29703 if (IsStrict)
29704 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
29705
29706 return Tmp.first;
29707}
29708
29709// Custom split CVTPS2PH with wide types.
29710static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
29711 SDLoc dl(Op);
29712 EVT VT = Op.getValueType();
29713 SDValue Lo, Hi;
29714 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
29715 EVT LoVT, HiVT;
29716 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
29717 SDValue RC = Op.getOperand(1);
29718 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
29719 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
29720 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29721}
29722
29723/// Provide custom lowering hooks for some operations.
29724SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
29725 switch (Op.getOpcode()) {
29726  default: llvm_unreachable("Should not custom lower this!");
29727 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
29728 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
29729 return LowerCMP_SWAP(Op, Subtarget, DAG);
29730 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
29731 case ISD::ATOMIC_LOAD_ADD:
29732 case ISD::ATOMIC_LOAD_SUB:
29733 case ISD::ATOMIC_LOAD_OR:
29734 case ISD::ATOMIC_LOAD_XOR:
29735 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
29736 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
29737 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
29738 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
29739 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
29740 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
29741 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
29742 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
29743 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
29744 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
29745 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
29746 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
29747 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
29748 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
29749 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
29750 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
29751 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
29752 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
29753 case ISD::SHL_PARTS:
29754 case ISD::SRA_PARTS:
29755 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
29756 case ISD::FSHL:
29757 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
29758 case ISD::STRICT_SINT_TO_FP:
29759 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
29760 case ISD::STRICT_UINT_TO_FP:
29761 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
29762 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
29763 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
29764 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
29765 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
29766 case ISD::ZERO_EXTEND_VECTOR_INREG:
29767 case ISD::SIGN_EXTEND_VECTOR_INREG:
29768 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
29769 case ISD::FP_TO_SINT:
29770 case ISD::STRICT_FP_TO_SINT:
29771 case ISD::FP_TO_UINT:
29772 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
29773 case ISD::FP_EXTEND:
29774 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
29775 case ISD::FP_ROUND:
29776 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
29777 case ISD::FP16_TO_FP:
29778 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
29779 case ISD::FP_TO_FP16:
29780 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
29781 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
29782 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
29783 case ISD::FADD:
29784 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
29785 case ISD::FROUND: return LowerFROUND(Op, DAG);
29786 case ISD::FABS:
29787 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
29788 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
29789 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
29790 case ISD::LRINT:
29791 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
29792 case ISD::SETCC:
29793 case ISD::STRICT_FSETCC:
29794 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
29795 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
29796 case ISD::SELECT: return LowerSELECT(Op, DAG);
29797 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
29798 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
29799 case ISD::VASTART: return LowerVASTART(Op, DAG);
29800 case ISD::VAARG: return LowerVAARG(Op, DAG);
29801 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
29802 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
29803 case ISD::INTRINSIC_VOID:
29804 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
29805 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
29806 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
29807 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
29808 case ISD::FRAME_TO_ARGS_OFFSET:
29809 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
29810 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
29811 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
29812 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
29813 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
29814 case ISD::EH_SJLJ_SETUP_DISPATCH:
29815 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
29816 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
29817 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
29818 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
29819 case ISD::CTLZ:
29820 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
29821 case ISD::CTTZ:
29822 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
29823 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
29824 case ISD::MULHS:
29825 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
29826 case ISD::ROTL:
29827 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
29828 case ISD::SRA:
29829 case ISD::SRL:
29830 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
29831 case ISD::SADDO:
29832 case ISD::UADDO:
29833 case ISD::SSUBO:
29834 case ISD::USUBO:
29835 case ISD::SMULO:
29836 case ISD::UMULO: return LowerXALUO(Op, DAG);
29837 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
29838 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
29839 case ISD::SADDO_CARRY:
29840 case ISD::SSUBO_CARRY:
29841 case ISD::ADDCARRY:
29842 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
29843 case ISD::ADD:
29844 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
29845 case ISD::UADDSAT:
29846 case ISD::SADDSAT:
29847 case ISD::USUBSAT:
29848 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
29849 case ISD::SMAX:
29850 case ISD::SMIN:
29851 case ISD::UMAX:
29852 case ISD::UMIN: return LowerMINMAX(Op, DAG);
29853 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
29854 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
29855 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
29856 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
29857 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
29858 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
29859 case ISD::GC_TRANSITION_START:
29860 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
29861 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
29862 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
29863 }
29864}
29865
29866/// Replace a node with an illegal result type with a new node built out of
29867/// custom code.
29868void X86TargetLowering::ReplaceNodeResults(SDNode *N,
29869 SmallVectorImpl<SDValue>&Results,
29870 SelectionDAG &DAG) const {
29871 SDLoc dl(N);
29872 switch (N->getOpcode()) {
29873 default:
29874#ifndef NDEBUG
29875 dbgs() << "ReplaceNodeResults: ";
29876 N->dump(&DAG);
29877#endif
29878    llvm_unreachable("Do not know how to custom type legalize this operation!");
29879 case X86ISD::CVTPH2PS: {
29880 EVT VT = N->getValueType(0);
29881 SDValue Lo, Hi;
29882 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29883 EVT LoVT, HiVT;
29884 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
29885 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
29886 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
29887 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29888 Results.push_back(Res);
29889 return;
29890 }
29891 case X86ISD::STRICT_CVTPH2PS: {
29892 EVT VT = N->getValueType(0);
29893 SDValue Lo, Hi;
29894 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
29895 EVT LoVT, HiVT;
29896 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
29897 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
29898 {N->getOperand(0), Lo});
29899 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
29900 {N->getOperand(0), Hi});
29901 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
29902 Lo.getValue(1), Hi.getValue(1));
29903 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29904 Results.push_back(Res);
29905 Results.push_back(Chain);
29906 return;
29907 }
29908 case X86ISD::CVTPS2PH:
29909 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
29910 return;
29911 case ISD::CTPOP: {
29912    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
29913 // Use a v2i64 if possible.
29914 bool NoImplicitFloatOps =
29915 DAG.getMachineFunction().getFunction().hasFnAttribute(
29916 Attribute::NoImplicitFloat);
29917 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
29918 SDValue Wide =
29919 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
29920 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
29921 // Bit count should fit in 32-bits, extract it as that and then zero
29922 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
29923 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
29924 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
29925 DAG.getIntPtrConstant(0, dl));
29926 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
29927 Results.push_back(Wide);
29928 }
29929 return;
29930 }
29931 case ISD::MUL: {
29932 EVT VT = N->getValueType(0);
29933    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29934           VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
29935 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
29936 // elements are needed.
29937 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29938 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
29939 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
29940 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
29941 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
29942 unsigned NumConcats = 16 / VT.getVectorNumElements();
29943 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
29944 ConcatOps[0] = Res;
29945 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
29946 Results.push_back(Res);
29947 return;
29948 }
29949 case X86ISD::VPMADDWD:
29950 case X86ISD::AVG: {
29951 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
29952    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29953
29954 EVT VT = N->getValueType(0);
29955 EVT InVT = N->getOperand(0).getValueType();
29956    assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
29957           "Expected a VT that divides into 128 bits.");
29958    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29959           "Unexpected type action!");
29960 unsigned NumConcat = 128 / InVT.getSizeInBits();
29961
29962 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
29963 InVT.getVectorElementType(),
29964 NumConcat * InVT.getVectorNumElements());
29965 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
29966 VT.getVectorElementType(),
29967 NumConcat * VT.getVectorNumElements());
29968
29969 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
29970 Ops[0] = N->getOperand(0);
29971 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29972 Ops[0] = N->getOperand(1);
29973 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29974
29975 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
29976 Results.push_back(Res);
29977 return;
29978 }
29979 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
29980 case X86ISD::FMINC:
29981 case X86ISD::FMIN:
29982 case X86ISD::FMAXC:
29983 case X86ISD::FMAX: {
29984 EVT VT = N->getValueType(0);
29985    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
29986 SDValue UNDEF = DAG.getUNDEF(VT);
29987 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29988 N->getOperand(0), UNDEF);
29989 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29990 N->getOperand(1), UNDEF);
29991 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
29992 return;
29993 }
29994 case ISD::SDIV:
29995 case ISD::UDIV:
29996 case ISD::SREM:
29997 case ISD::UREM: {
29998 EVT VT = N->getValueType(0);
29999 if (VT.isVector()) {
30000      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30001             "Unexpected type action!");
30002 // If this RHS is a constant splat vector we can widen this and let
30003 // division/remainder by constant optimize it.
30004 // TODO: Can we do something for non-splat?
30005 APInt SplatVal;
30006 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30007 unsigned NumConcats = 128 / VT.getSizeInBits();
30008 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30009 Ops0[0] = N->getOperand(0);
30010 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30011 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30012 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30013 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30014 Results.push_back(Res);
30015 }
30016 return;
30017 }
30018
30019 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30020 Results.push_back(V);
30021 return;
30022 }
30023 case ISD::TRUNCATE: {
30024 MVT VT = N->getSimpleValueType(0);
30025 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30026 return;
30027
30028 // The generic legalizer will try to widen the input type to the same
30029 // number of elements as the widened result type. But this isn't always
30030 // the best thing so do some custom legalization to avoid some cases.
30031 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30032 SDValue In = N->getOperand(0);
30033 EVT InVT = In.getValueType();
30034
30035 unsigned InBits = InVT.getSizeInBits();
30036 if (128 % InBits == 0) {
30037      // 128-bit and smaller inputs should avoid truncate altogether and
30038      // just use a build_vector that will become a shuffle.
30039 // TODO: Widen and use a shuffle directly?
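// Sketch (example types assumed): truncating v2i64 to an illegal v2i32
// extracts the two i64 elements, truncates each scalar to i32, and builds a
// v4i32 build_vector <lo0, lo1, undef, undef>; later DAG combines typically
// turn this into a single shuffle rather than a real truncate.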
30040 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30041 EVT EltVT = VT.getVectorElementType();
30042 unsigned WidenNumElts = WidenVT.getVectorNumElements();
30043 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30044 // Use the original element count so we don't do more scalar opts than
30045 // necessary.
30046 unsigned MinElts = VT.getVectorNumElements();
30047 for (unsigned i=0; i < MinElts; ++i) {
30048 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30049 DAG.getIntPtrConstant(i, dl));
30050 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30051 }
30052 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30053 return;
30054 }
30055 // With AVX512 there are some cases that can use a target specific
30056 // truncate node to go from 256/512 to less than 128 with zeros in the
30057 // upper elements of the 128 bit result.
30058 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30059 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
30060 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30061 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30062 return;
30063 }
30064 // There's one case we can widen to 512 bits and use VTRUNC.
30065 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30066 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30067 DAG.getUNDEF(MVT::v4i64));
30068 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30069 return;
30070 }
30071 }
30072 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30073 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30074 isTypeLegal(MVT::v4i64)) {
30075 // Input needs to be split and output needs to be widened. Let's use two
30076 // VTRUNCs, and shuffle their results together into the wider type.
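// Sketch: each v4i64 half truncates (X86ISD::VTRUNC) into bytes 0-3 of a
// v16i8, and the shuffle mask {0,1,2,3, 16,17,18,19, ...} gathers those four
// bytes from each half into the low 8 bytes of the widened v16i8 result; the
// remaining lanes are left undef.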
30077 SDValue Lo, Hi;
30078 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30079
30080 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30081 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30082 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30083 { 0, 1, 2, 3, 16, 17, 18, 19,
30084 -1, -1, -1, -1, -1, -1, -1, -1 });
30085 Results.push_back(Res);
30086 return;
30087 }
30088
30089 return;
30090 }
30091 case ISD::ANY_EXTEND:
30092 // Right now, only MVT::v8i8 has Custom action for an illegal type.
30093 // It's intended to custom handle the input type.
30094 assert(N->getValueType(0) == MVT::v8i8 &&
30095 "Do not know how to legalize this Node");
30096 return;
30097 case ISD::SIGN_EXTEND:
30098 case ISD::ZERO_EXTEND: {
30099 EVT VT = N->getValueType(0);
30100 SDValue In = N->getOperand(0);
30101 EVT InVT = In.getValueType();
30102 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30103 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30104 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30105 "Unexpected type action!");
30106 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30107 // Custom split this so we can extend i8/i16->i32 invec. This is better
30108 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
30109 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
30110 // we allow the sra from the extend to i32 to be shared by the split.
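// Sketch: after the sign_extend to v4i32, SignBits is computed as
// (0 > In) ? -1 : 0 per lane, i.e. an all-ones mask for negative elements.
// Interleaving <value, signbits> pairs with the {0,4,1,5} / {2,6,3,7}
// shuffles and bitcasting to v2i64 yields, on little-endian x86, exactly the
// 64-bit sign-extended values.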
30111 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30112
30113 // Fill a vector with sign bits for each element.
30114 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30115 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30116
30117 // Create an unpackl and unpackh to interleave the sign bits then bitcast
30118 // to v2i64.
30119 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30120 {0, 4, 1, 5});
30121 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30122 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30123 {2, 6, 3, 7});
30124 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30125
30126 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30127 Results.push_back(Res);
30128 return;
30129 }
30130
30131 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30132 if (!InVT.is128BitVector()) {
30133 // Not a 128 bit vector, but maybe type legalization will promote
30134 // it to 128 bits.
30135 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30136 return;
30137 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30138 if (!InVT.is128BitVector())
30139 return;
30140
30141 // Promote the input to 128 bits. Type legalization will turn this into
30142 // zext_inreg/sext_inreg.
30143 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30144 }
30145
30146 // Perform custom splitting instead of the two stage extend we would get
30147 // by default.
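// Sketch: the low half extends in place via *_EXTEND_VECTOR_INREG; for the
// high half, the shuffle mask below moves element i+NumElts/2 into slot i so
// the upper input elements land in the low positions before being extended,
// and the two halves are then concatenated.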
30148 EVT LoVT, HiVT;
30149 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30150 assert(isTypeLegal(LoVT) && "Split VT not legal?");
30151
30152 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30153
30154 // We need to shift the input over by half the number of elements.
30155 unsigned NumElts = InVT.getVectorNumElements();
30156 unsigned HalfNumElts = NumElts / 2;
30157 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30158 for (unsigned i = 0; i != HalfNumElts; ++i)
30159 ShufMask[i] = i + HalfNumElts;
30160
30161 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30162 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30163
30164 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30165 Results.push_back(Res);
30166 }
30167 return;
30168 }
30169 case ISD::FP_TO_SINT:
30170 case ISD::STRICT_FP_TO_SINT:
30171 case ISD::FP_TO_UINT:
30172 case ISD::STRICT_FP_TO_UINT: {
30173 bool IsStrict = N->isStrictFPOpcode();
30174 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30175 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30176 EVT VT = N->getValueType(0);
30177 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30178 EVT SrcVT = Src.getValueType();
30179
30180 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30181 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30182 "Unexpected type action!");
30183
30184 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
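// Sketch (example types assumed): for a v4i8 result from v4f32, NewEltWidth
// is min(128/4, 32) = 32, so the conversion runs as fp_to_*int to v4i32, is
// tagged with AssertSext/AssertZext to record the known value range,
// truncated back to v4i8, and padded with undef up to a 128-bit v16i8.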
30185 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30186 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30187 VT.getVectorNumElements());
30188 SDValue Res;
30189 SDValue Chain;
30190 if (IsStrict) {
30191 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30192 {N->getOperand(0), Src});
30193 Chain = Res.getValue(1);
30194 } else
30195 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30196
30197 // Preserve what we know about the size of the original result. Except
30198 // when the result is v2i32 since we can't widen the assert.
30199 if (PromoteVT != MVT::v2i32)
30200 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
30201 dl, PromoteVT, Res,
30202 DAG.getValueType(VT.getVectorElementType()));
30203
30204 // Truncate back to the original width.
30205 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30206
30207 // Now widen to 128 bits.
30208 unsigned NumConcats = 128 / VT.getSizeInBits();
30209 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30210 VT.getVectorNumElements() * NumConcats);
30211 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30212 ConcatOps[0] = Res;
30213 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30214 Results.push_back(Res);
30215 if (IsStrict)
30216 Results.push_back(Chain);
30217 return;
30218 }
30219
30220
30221 if (VT == MVT::v2i32) {
30222 assert((IsSigned || Subtarget.hasAVX512()) &&
30223 "Can only handle signed conversion without AVX512");
30224 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30225 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30226 "Unexpected type action!");
30227 if (Src.getValueType() == MVT::v2f64) {
30228 unsigned Opc;
30229 if (IsStrict)
30230 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30231 else
30232 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30233
30234 // If we have VLX we can emit a target specific FP_TO_UINT node.
30235 if (!IsSigned && !Subtarget.hasVLX()) {
30236 // Otherwise we can defer to the generic legalizer which will widen
30237 // the input as well. This will be further widened during op
30238 // legalization to v8i32<-v8f64.
30239 // For strict nodes we'll need to widen ourselves.
30240 // FIXME: Fix the type legalizer to safely widen strict nodes?
30241 if (!IsStrict)
30242 return;
30243 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30244 DAG.getConstantFP(0.0, dl, MVT::v2f64));
30245 Opc = N->getOpcode();
30246 }
30247 SDValue Res;
30248 SDValue Chain;
30249 if (IsStrict) {
30250 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30251 {N->getOperand(0), Src});
30252 Chain = Res.getValue(1);
30253 } else {
30254 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30255 }
30256 Results.push_back(Res);
30257 if (IsStrict)
30258 Results.push_back(Chain);
30259 return;
30260 }
30261
30262 // Custom widen strict v2f32->v2i32 by padding with zeros.
30263 // FIXME: Should generic type legalizer do this?
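// Sketch: the strict node is rebuilt on a zero-padded v4f32 source so its
// chain is preserved; the extra lanes just convert 0.0 and land in the upper
// elements of the widened v4i32 result, which the original v2i32 users never
// read.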
30264 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30265 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30266 DAG.getConstantFP(0.0, dl, MVT::v2f32));
30267 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30268 {N->getOperand(0), Src});
30269 Results.push_back(Res);
30270 Results.push_back(Res.getValue(1));
30271 return;
30272 }
30273
30274 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
30275 // so early out here.
30276 return;
30277 }
30278
30279 assert(!VT.isVector() && "Vectors should have been handled above!");
30280
30281 if (Subtarget.hasDQI() && VT == MVT::i64 &&
30282 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
30283 assert(!Subtarget.is64Bit() && "i64 should be legal");
30284 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
30285 // If we use a 128-bit result we might need to use a target specific node.
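// Sketch: on a 32-bit target the scalar source is inserted into lane 0 of a
// vector (a v2i64 result with VLX, otherwise a 512-bit v8i64 result), the
// conversion runs as a vector cvtt node, and lane 0 is extracted back out as
// the i64 result.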
30286 unsigned SrcElts =
30287 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
30288 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
30289 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
30290 unsigned Opc = N->getOpcode();
30291 if (NumElts != SrcElts) {
30292 if (IsStrict)
30293 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30294 else
30295 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30296 }
30297
30298 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
30299 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
30300 DAG.getConstantFP(0.0, dl, VecInVT), Src,
30301 ZeroIdx);
30302 SDValue Chain;
30303 if (IsStrict) {
30304 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
30305 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
30306 Chain = Res.getValue(1);
30307 } else
30308 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
30309 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
30310 Results.push_back(Res);
30311 if (IsStrict)
30312 Results.push_back(Chain);
30313 return;
30314 }
30315
30316 SDValue Chain;
30317 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
30318 Results.push_back(V);
30319 if (IsStrict)
30320 Results.push_back(Chain);
30321 }
30322 return;
30323 }
30324 case ISD::LRINT:
30325 case ISD::LLRINT: {
30326 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
30327 Results.push_back(V);
30328 return;
30329 }
30330
30331 case ISD::SINT_TO_FP:
30332 case ISD::STRICT_SINT_TO_FP:
30333 case ISD::UINT_TO_FP:
30334 case ISD::STRICT_UINT_TO_FP: {
30335 bool IsStrict = N->isStrictFPOpcode();
30336 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
30337 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
30338 EVT VT = N->getValueType(0);
30339 if (VT != MVT::v2f32)
30340 return;
30341 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30342 EVT SrcVT = Src.getValueType();
30343 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
30344 if (IsStrict) {
30345 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
30346 : X86ISD::STRICT_CVTUI2P;
30347 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
30348 {N->getOperand(0), Src});
30349 Results.push_back(Res);
30350 Results.push_back(Res.getValue(1));
30351 } else {
30352 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
30353 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
30354 }
30355 return;
30356 }
30357 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
30358 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
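// Sketch of the unsigned path below: values with the top bit set are halved
// as (Src >> 1) | (Src & 1), keeping the low bit so float rounding is
// unaffected, converted with the signed scalar cvt, and then doubled with an
// fadd; smaller values convert directly, and the final select picks per lane.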
30359 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
30360 SDValue One = DAG.getConstant(1, dl, SrcVT);
30361 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
30362 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
30363 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
30364 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
30365 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
30366 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
30367 for (int i = 0; i != 2; ++i) {
30368 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
30369 SignSrc, DAG.getIntPtrConstant(i, dl));
30370 if (IsStrict)
30371 SignCvts[i] =
30372 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
30373 {N->getOperand(0), Elt});
30374 else
30375 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
30376 };
30377 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
30378 SDValue Slow, Chain;
30379 if (IsStrict) {
30380 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30381 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
30382 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
30383 {Chain, SignCvt, SignCvt});
30384 Chain = Slow.getValue(1);
30385 } else {
30386 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
30387 }
30388 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
30389 IsNeg =
30390 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
30391 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
30392 Results.push_back(Cvt);
30393 if (IsStrict)
30394 Results.push_back(Chain);
30395 return;
30396 }
30397
30398 if (SrcVT != MVT::v2i32)
30399 return;
30400
30401 if (IsSigned || Subtarget.hasAVX512()) {
30402 if (!IsStrict)
30403 return;
30404
30405 // Custom widen strict v2i32->v2f32 to avoid scalarization.
30406 // FIXME: Should generic type legalizer do this?
30407 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
30408 DAG.getConstant(0, dl, MVT::v2i32));
30409 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
30410 {N->getOperand(0), Src});
30411 Results.push_back(Res);
30412 Results.push_back(Res.getValue(1));
30413 return;
30414 }
30415
30416 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
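// Sketch of the bias trick below (standard uint32->float lowering): with
// VBias = 2^52, bitcasting (zext64(x) | bits(2^52)) to double gives exactly
// 2^52 + x, subtracting VBias recovers x exactly as an f64, and VFPROUND
// performs the only real rounding step down to f32.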
30417 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
30418 SDValue VBias =
30419 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
30420 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
30421 DAG.getBitcast(MVT::v2i64, VBias));
30422 Or = DAG.getBitcast(MVT::v2f64, Or);
30423 if (IsStrict) {
30424 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
30425 {N->getOperand(0), Or, VBias});
30426 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
30427 {MVT::v4f32, MVT::Other},
30428 {Sub.getValue(1), Sub});
30429 Results.push_back(Res);
30430 Results.push_back(Res.getValue(1));
30431 } else {
30432 // TODO: Are there any fast-math-flags to propagate here?
30433 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
30434 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
30435 }
30436 return;
30437 }
30438 case ISD::STRICT_FP_ROUND:
30439 case ISD::FP_ROUND: {
30440 bool IsStrict = N->isStrictFPOpcode();
30441 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30442 if (!isTypeLegal(Src.getValueType()))
30443 return;
30444 SDValue V;
30445 if (IsStrict)
30446 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
30447 {N->getOperand(0), N->getOperand(1)});
30448 else
30449 V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
30450 Results.push_back(V);
30451 if (IsStrict)
30452 Results.push_back(V.getValue(1));
30453 return;
30454 }
30455 case ISD::FP_EXTEND:
30456 case ISD::STRICT_FP_EXTEND: {
30457 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
30458 // No other ValueType for FP_EXTEND should reach this point.
30459 assert(N->getValueType(0) == MVT::v2f32 &&
30460 "Do not know how to legalize this Node");
30461 return;
30462 }
30463 case ISD::INTRINSIC_W_CHAIN: {
30464 unsigned IntNo = N->getConstantOperandVal(1);
30465 switch (IntNo) {
30466 default : llvm_unreachable("Do not know how to custom type "
30467 "legalize this intrinsic operation!");
30468 case Intrinsic::x86_rdtsc:
30469 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
30470 Results);
30471 case Intrinsic::x86_rdtscp:
30472 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
30473 Results);
30474 case Intrinsic::x86_rdpmc:
30475 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
30476 Results);
30477 return;
30478 case Intrinsic::x86_xgetbv:
30479 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
30480 Results);
30481 return;
30482 }
30483 }
30484 case ISD::READCYCLECOUNTER: {
30485 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
30486 }
30487 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
30488 EVT T = N->getValueType(0);
30489 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
30490 bool Regs64bit = T == MVT::i128;
30491 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
30492 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B")
30493 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
30494 SDValue cpInL, cpInH;
30495 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
30496 DAG.getConstant(0, dl, HalfT));
30497 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
30498 DAG.getConstant(1, dl, HalfT));
30499 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
30500 Regs64bit ? X86::RAX : X86::EAX,
30501 cpInL, SDValue());
30502 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
30503 Regs64bit ? X86::RDX : X86::EDX,
30504 cpInH, cpInL.getValue(1));
30505 SDValue swapInL, swapInH;
30506 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
30507 DAG.getConstant(0, dl, HalfT));
30508 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
30509 DAG.getConstant(1, dl, HalfT));
30510 swapInH =
30511 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
30512 swapInH, cpInH.getValue(1));
30513
30514 // In 64-bit mode we might need the base pointer in RBX, but we can't know
30515 // until later. So we keep the RBX input in a vreg and use a custom
30516 // inserter.
30517 // Since RBX will be a reserved register, the register allocator will not
30518 // make sure its value is properly saved and restored around this
30519 // live-range.
30520 SDValue Result;
30521 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30522 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
30523 if (Regs64bit) {
30524 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
30525 swapInH.getValue(1)};
30526 Result =
30527 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
30528 } else {
30529 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
30530 swapInH.getValue(1));
30531 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
30532 swapInL.getValue(1)};
30533 Result =
30534 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
30535 }
30536
30537 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
30538 Regs64bit ? X86::RAX : X86::EAX,
30539 HalfT, Result.getValue(1));
30540 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
30541 Regs64bit ? X86::RDX : X86::EDX,
30542 HalfT, cpOutL.getValue(2));
30543 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
30544
30545 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
30546 MVT::i32, cpOutH.getValue(2));
30547 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
30548 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
30549
30550 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
30551 Results.push_back(Success);
30552 Results.push_back(EFLAGS.getValue(1));
30553 return;
30554 }
30555 case ISD::ATOMIC_LOAD: {
30556 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30557 bool NoImplicitFloatOps =
30558 DAG.getMachineFunction().getFunction().hasFnAttribute(
30559 Attribute::NoImplicitFloat);
30560 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
30561 auto *Node = cast<AtomicSDNode>(N);
30562 if (Subtarget.hasSSE1()) {
30563 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
30564 // Then extract the lower 64-bits.
30565 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
30566 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
30567 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
30568 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
30569 MVT::i64, Node->getMemOperand());
30570 if (Subtarget.hasSSE2()) {
30571 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
30572 DAG.getIntPtrConstant(0, dl));
30573 Results.push_back(Res);
30574 Results.push_back(Ld.getValue(1));
30575 return;
30576 }
30577 // We use an alternative sequence for SSE1 that extracts as v2f32 and
30578 // then casts to i64. This avoids a 128-bit stack temporary being
30579 // created by type legalization if we were to cast v4f32->v2i64.
30580 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
30581 DAG.getIntPtrConstant(0, dl));
30582 Res = DAG.getBitcast(MVT::i64, Res);
30583 Results.push_back(Res);
30584 Results.push_back(Ld.getValue(1));
30585 return;
30586 }
30587 if (Subtarget.hasX87()) {
30588 // First load this into an 80-bit X87 register. This will put the whole
30589 // integer into the significand.
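// Note: the x87 f80 format has a 64-bit significand, so FILD can hold any
// i64 exactly; the FIST store to a stack slot and the ordinary reload below
// round-trip the value without loss, and only the initial FILD needs to be
// atomic.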
30590 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
30591 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
30592 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
30593 dl, Tys, Ops, MVT::i64,
30594 Node->getMemOperand());
30595 SDValue Chain = Result.getValue(1);
30596
30597 // Now store the X87 register to a stack temporary and convert to i64.
30598 // This store is not atomic and doesn't need to be.
30599 // FIXME: We don't need a stack temporary if the result of the load
30600 // is already being stored. We could just directly store there.
30601 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
30602 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30603 MachinePointerInfo MPI =
30604 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30605 SDValue StoreOps[] = { Chain, Result, StackPtr };
30606 Chain = DAG.getMemIntrinsicNode(
30607 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
30608 MPI, None /*Align*/, MachineMemOperand::MOStore);
30609
30610 // Finally load the value back from the stack temporary and return it.
30611 // This load is not atomic and doesn't need to be.
30612 // This load will be further type legalized.
30613 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
30614 Results.push_back(Result);
30615 Results.push_back(Result.getValue(1));
30616 return;
30617 }
30618 }
30619 // TODO: Use MOVLPS when SSE1 is available?
30620 // Delegate to generic TypeLegalization. Situations we can really handle
30621 // should have already been dealt with by AtomicExpandPass.cpp.
30622 break;
30623 }
30624 case ISD::ATOMIC_SWAP:
30625 case ISD::ATOMIC_LOAD_ADD:
30626 case ISD::ATOMIC_LOAD_SUB:
30627 case ISD::ATOMIC_LOAD_AND:
30628 case ISD::ATOMIC_LOAD_OR:
30629 case ISD::ATOMIC_LOAD_XOR:
30630 case ISD::ATOMIC_LOAD_NAND:
30631 case ISD::ATOMIC_LOAD_MIN:
30632 case ISD::ATOMIC_LOAD_MAX:
30633 case ISD::ATOMIC_LOAD_UMIN:
30634 case ISD::ATOMIC_LOAD_UMAX:
30635 // Delegate to generic TypeLegalization. Situations we can really handle
30636 // should have already been dealt with by AtomicExpandPass.cpp.
30637 break;
30638
30639 case ISD::BITCAST: {
30640 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30641 EVT DstVT = N->getValueType(0);
30642 EVT SrcVT = N->getOperand(0).getValueType();
30643
30644 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
30645 // we can split using the k-register rather than memory.
30646 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
30647 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
30648 SDValue Lo, Hi;
30649 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30650 Lo = DAG.getBitcast(MVT::i32, Lo);
30651 Hi = DAG.getBitcast(MVT::i32, Hi);
30652 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
30653 Results.push_back(Res);
30654 return;
30655 }
30656
30657 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
30658 // FIXME: Use v4f32 for SSE1?
30659 assert(Subtarget.hasSSE2() && "Requires SSE2");
30660 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
30661 "Unexpected type action!");
30662 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
30663 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
30664 N->getOperand(0));
30665 Res = DAG.getBitcast(WideVT, Res);
30666 Results.push_back(Res);
30667 return;
30668 }
30669
30670 return;
30671 }
30672 case ISD::MGATHER: {
30673 EVT VT = N->getValueType(0);
30674 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
30675 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
30676 auto *Gather = cast<MaskedGatherSDNode>(N);
30677 SDValue Index = Gather->getIndex();
30678 if (Index.getValueType() != MVT::v2i64)
30679 return;
30680 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30681 "Unexpected type action!");
30682 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
30683 SDValue Mask = Gather->getMask();
30684 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30685 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
30686 Gather->getPassThru(),
30687 DAG.getUNDEF(VT));
30688 if (!Subtarget.hasVLX()) {
30689 // We need to widen the mask, but the instruction will only use 2
30690 // of its elements. So we can use undef.
30691 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
30692 DAG.getUNDEF(MVT::v2i1));
30693 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
30694 }
30695 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
30696 Gather->getBasePtr(), Index, Gather->getScale() };
30697 SDValue Res = DAG.getMemIntrinsicNode(
30698 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
30699 Gather->getMemoryVT(), Gather->getMemOperand());
30700 Results.push_back(Res);
30701 Results.push_back(Res.getValue(1));
30702 return;
30703 }
30704 return;
30705 }
30706 case ISD::LOAD: {
30707 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
30708 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
30709 // cast since type legalization will try to use an i64 load.
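// Sketch: the 64-bit vector load is performed as a single scalar f64/i64
// load, wrapped with SCALAR_TO_VECTOR into a two-element vector, and bitcast
// to the widened 128-bit type; on SSE1-only targets a VZEXT_LOAD of v4f32 is
// used instead.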
30710 MVT VT = N->getSimpleValueType(0);
30711 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
30712 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30713 "Unexpected type action!");
30714 if (!ISD::isNON_EXTLoad(N))
30715 return;
30716 auto *Ld = cast<LoadSDNode>(N);
30717 if (Subtarget.hasSSE2()) {
30718 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
30719 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
30720 Ld->getPointerInfo(), Ld->getOriginalAlign(),
30721 Ld->getMemOperand()->getFlags());
30722 SDValue Chain = Res.getValue(1);
30723 MVT VecVT = MVT::getVectorVT(LdVT, 2);
30724 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
30725 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
30726 Res = DAG.getBitcast(WideVT, Res);
30727 Results.push_back(Res);
30728 Results.push_back(Chain);
30729 return;
30730 }
30731 assert(Subtarget.hasSSE1() && "Expected SSE");
30732 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
30733 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
30734 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
30735 MVT::i64, Ld->getMemOperand());
30736 Results.push_back(Res);
30737 Results.push_back(Res.getValue(1));
30738 return;
30739 }
30740 case ISD::ADDRSPACECAST: {
30741 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
30742 Results.push_back(V);
30743 return;
30744 }
30745 case ISD::BITREVERSE:
30746 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30747 assert(Subtarget.hasXOP() && "Expected XOP");
30748 // We can use VPPERM by copying to a vector register and back. We'll need
30749 // to move the scalar in two i32 pieces.
30750 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
30751 return;
30752 }
30753}
30754
30755const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
30756 switch ((X86ISD::NodeType)Opcode) {
30757 case X86ISD::FIRST_NUMBER: break;
30758#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
30759 NODE_NAME_CASE(BSF)
30760 NODE_NAME_CASE(BSR)
30761 NODE_NAME_CASE(FSHL)
30762 NODE_NAME_CASE(FSHR)
30763 NODE_NAME_CASE(FAND)
30764 NODE_NAME_CASE(FANDN)
30765 NODE_NAME_CASE(FOR)
30766 NODE_NAME_CASE(FXOR)
30767 NODE_NAME_CASE(FILD)
30768 NODE_NAME_CASE(FIST)
30769 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
30770 NODE_NAME_CASE(FLD)
30771 NODE_NAME_CASE(FST)
30772 NODE_NAME_CASE(CALL)
30773 NODE_NAME_CASE(BT)
30774 NODE_NAME_CASE(CMP)
30775 NODE_NAME_CASE(FCMP)
30776 NODE_NAME_CASE(STRICT_FCMP)
30777 NODE_NAME_CASE(STRICT_FCMPS)
30778 NODE_NAME_CASE(COMI)
30779 NODE_NAME_CASE(UCOMI)
30780 NODE_NAME_CASE(CMPM)
30781 NODE_NAME_CASE(CMPMM)
30782 NODE_NAME_CASE(STRICT_CMPM)
30783 NODE_NAME_CASE(CMPMM_SAE)
30784 NODE_NAME_CASE(SETCC)
30785 NODE_NAME_CASE(SETCC_CARRY)
30786 NODE_NAME_CASE(FSETCC)
30787 NODE_NAME_CASE(FSETCCM)
30788 NODE_NAME_CASE(FSETCCM_SAE)
30789 NODE_NAME_CASE(CMOV)
30790 NODE_NAME_CASE(BRCOND)
30791 NODE_NAME_CASE(RET_FLAG)
30792 NODE_NAME_CASE(IRET)
30793 NODE_NAME_CASE(REP_STOS)
30794 NODE_NAME_CASE(REP_MOVS)
30795 NODE_NAME_CASE(GlobalBaseReg)
30796 NODE_NAME_CASE(Wrapper)
30797 NODE_NAME_CASE(WrapperRIP)
30798 NODE_NAME_CASE(MOVQ2DQ)
30799 NODE_NAME_CASE(MOVDQ2Q)
30800 NODE_NAME_CASE(MMX_MOVD2W)
30801 NODE_NAME_CASE(MMX_MOVW2D)
30802 NODE_NAME_CASE(PEXTRB)
30803 NODE_NAME_CASE(PEXTRW)
30804 NODE_NAME_CASE(INSERTPS)
30805 NODE_NAME_CASE(PINSRB)
30806 NODE_NAME_CASE(PINSRW)
30807 NODE_NAME_CASE(PSHUFB)
30808 NODE_NAME_CASE(ANDNP)
30809 NODE_NAME_CASE(BLENDI)
30810 NODE_NAME_CASE(BLENDV)
30811 NODE_NAME_CASE(HADD)
30812 NODE_NAME_CASE(HSUB)
30813 NODE_NAME_CASE(FHADD)
30814 NODE_NAME_CASE(FHSUB)
30815 NODE_NAME_CASE(CONFLICT)
30816 NODE_NAME_CASE(FMAX)
30817 NODE_NAME_CASE(FMAXS)
30818 NODE_NAME_CASE(FMAX_SAE)
30819 NODE_NAME_CASE(FMAXS_SAE)
30820 NODE_NAME_CASE(FMIN)
30821 NODE_NAME_CASE(FMINS)
30822 NODE_NAME_CASE(FMIN_SAE)
30823 NODE_NAME_CASE(FMINS_SAE)
30824 NODE_NAME_CASE(FMAXC)
30825 NODE_NAME_CASE(FMINC)
30826 NODE_NAME_CASE(FRSQRT)
30827 NODE_NAME_CASE(FRCP)
30828 NODE_NAME_CASE(EXTRQI)
30829 NODE_NAME_CASE(INSERTQI)
30830 NODE_NAME_CASE(TLSADDR)
30831 NODE_NAME_CASE(TLSBASEADDR)
30832 NODE_NAME_CASE(TLSCALL)
30833 NODE_NAME_CASE(EH_SJLJ_SETJMP)
30834 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
30835 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
30836 NODE_NAME_CASE(EH_RETURN)
30837 NODE_NAME_CASE(TC_RETURN)
30838 NODE_NAME_CASE(FNSTCW16m)
30839 NODE_NAME_CASE(LCMPXCHG_DAG)
30840 NODE_NAME_CASE(LCMPXCHG8_DAG)
30841 NODE_NAME_CASE(LCMPXCHG16_DAG)
30842 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
30843 NODE_NAME_CASE(LADD)
30844 NODE_NAME_CASE(LSUB)
30845 NODE_NAME_CASE(LOR)
30846 NODE_NAME_CASE(LXOR)
30847 NODE_NAME_CASE(LAND)
30848 NODE_NAME_CASE(VZEXT_MOVL)
30849 NODE_NAME_CASE(VZEXT_LOAD)
30850 NODE_NAME_CASE(VEXTRACT_STORE)
30851 NODE_NAME_CASE(VTRUNC)
30852 NODE_NAME_CASE(VTRUNCS)
30853 NODE_NAME_CASE(VTRUNCUS)
30854 NODE_NAME_CASE(VMTRUNC)
30855 NODE_NAME_CASE(VMTRUNCS)
30856 NODE_NAME_CASE(VMTRUNCUS)
30857 NODE_NAME_CASE(VTRUNCSTORES)
30858 NODE_NAME_CASE(VTRUNCSTOREUS)
30859 NODE_NAME_CASE(VMTRUNCSTORES)
30860 NODE_NAME_CASE(VMTRUNCSTOREUS)
30861 NODE_NAME_CASE(VFPEXT)
30862 NODE_NAME_CASE(STRICT_VFPEXT)
30863 NODE_NAME_CASE(VFPEXT_SAE)
30864 NODE_NAME_CASE(VFPEXTS)
30865 NODE_NAME_CASE(VFPEXTS_SAE)
30866 NODE_NAME_CASE(VFPROUND)
30867 NODE_NAME_CASE(STRICT_VFPROUND)
30868 NODE_NAME_CASE(VMFPROUND)
30869 NODE_NAME_CASE(VFPROUND_RND)
30870 NODE_NAME_CASE(VFPROUNDS)
30871 NODE_NAME_CASE(VFPROUNDS_RND)
30872 NODE_NAME_CASE(VSHLDQ)
30873 NODE_NAME_CASE(VSRLDQ)
30874 NODE_NAME_CASE(VSHL)
30875 NODE_NAME_CASE(VSRL)
30876 NODE_NAME_CASE(VSRA)
30877 NODE_NAME_CASE(VSHLI)
30878 NODE_NAME_CASE(VSRLI)
30879 NODE_NAME_CASE(VSRAI)
30880 NODE_NAME_CASE(VSHLV)
30881 NODE_NAME_CASE(VSRLV)
30882 NODE_NAME_CASE(VSRAV)
30883 NODE_NAME_CASE(VROTLI)
30884 NODE_NAME_CASE(VROTRI)
30885 NODE_NAME_CASE(VPPERM)
30886 NODE_NAME_CASE(CMPP)
30887 NODE_NAME_CASE(STRICT_CMPP)
30888 NODE_NAME_CASE(PCMPEQ)
30889 NODE_NAME_CASE(PCMPGT)
30890 NODE_NAME_CASE(PHMINPOS)
30891 NODE_NAME_CASE(ADD)
30892 NODE_NAME_CASE(SUB)
30893 NODE_NAME_CASE(ADC)
30894 NODE_NAME_CASE(SBB)
30895 NODE_NAME_CASE(SMUL)
30896 NODE_NAME_CASE(UMUL)
30897 NODE_NAME_CASE(OR)
30898 NODE_NAME_CASE(XOR)
30899 NODE_NAME_CASE(AND)
30900 NODE_NAME_CASE(BEXTR)
30901 NODE_NAME_CASE(BEXTRI)
30902 NODE_NAME_CASE(BZHI)
30903 NODE_NAME_CASE(PDEP)
30904 NODE_NAME_CASE(PEXT)
30905 NODE_NAME_CASE(MUL_IMM)
30906 NODE_NAME_CASE(MOVMSK)
30907 NODE_NAME_CASE(PTEST)
30908 NODE_NAME_CASE(TESTP)
30909 NODE_NAME_CASE(KORTEST)
30910 NODE_NAME_CASE(KTEST)
30911 NODE_NAME_CASE(KADD)
30912 NODE_NAME_CASE(KSHIFTL)
30913 NODE_NAME_CASE(KSHIFTR)
30914 NODE_NAME_CASE(PACKSS)
30915 NODE_NAME_CASE(PACKUS)
30916 NODE_NAME_CASE(PALIGNR)
30917 NODE_NAME_CASE(VALIGN)
30918 NODE_NAME_CASE(VSHLD)
30919 NODE_NAME_CASE(VSHRD)
30920 NODE_NAME_CASE(VSHLDV)
30921 NODE_NAME_CASE(VSHRDV)
30922 NODE_NAME_CASE(PSHUFD)
30923 NODE_NAME_CASE(PSHUFHW)
30924 NODE_NAME_CASE(PSHUFLW)
30925 NODE_NAME_CASE(SHUFP)
30926 NODE_NAME_CASE(SHUF128)
30927 NODE_NAME_CASE(MOVLHPS)
30928 NODE_NAME_CASE(MOVHLPS)
30929 NODE_NAME_CASE(MOVDDUP)
30930 NODE_NAME_CASE(MOVSHDUP)
30931 NODE_NAME_CASE(MOVSLDUP)
30932 NODE_NAME_CASE(MOVSD)
30933 NODE_NAME_CASE(MOVSS)
30934 NODE_NAME_CASE(UNPCKL)
30935 NODE_NAME_CASE(UNPCKH)
30936 NODE_NAME_CASE(VBROADCAST)
30937 NODE_NAME_CASE(VBROADCAST_LOAD)
30938 NODE_NAME_CASE(VBROADCASTM)
30939 NODE_NAME_CASE(SUBV_BROADCAST)
30940 NODE_NAME_CASE(VPERMILPV)
30941 NODE_NAME_CASE(VPERMILPI)
30942 NODE_NAME_CASE(VPERM2X128)
30943 NODE_NAME_CASE(VPERMV)
30944 NODE_NAME_CASE(VPERMV3)
30945 NODE_NAME_CASE(VPERMI)
30946 NODE_NAME_CASE(VPTERNLOG)
30947 NODE_NAME_CASE(VFIXUPIMM)
30948 NODE_NAME_CASE(VFIXUPIMM_SAE)
30949 NODE_NAME_CASE(VFIXUPIMMS)
30950 NODE_NAME_CASE(VFIXUPIMMS_SAE)
30951 NODE_NAME_CASE(VRANGE)
30952 NODE_NAME_CASE(VRANGE_SAE)
30953 NODE_NAME_CASE(VRANGES)
30954 NODE_NAME_CASE(VRANGES_SAE)
30955 NODE_NAME_CASE(PMULUDQ)
30956 NODE_NAME_CASE(PMULDQ)
30957 NODE_NAME_CASE(PSADBW)
30958 NODE_NAME_CASE(DBPSADBW)
30959 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
30960 NODE_NAME_CASE(VAARG_64)
30961 NODE_NAME_CASE(WIN_ALLOCA)
30962 NODE_NAME_CASE(MEMBARRIER)
30963 NODE_NAME_CASE(MFENCE)
30964 NODE_NAME_CASE(SEG_ALLOCA)
30965 NODE_NAME_CASE(PROBED_ALLOCA)
30966 NODE_NAME_CASE(RDRAND)
30967 NODE_NAME_CASE(RDSEED)
30968 NODE_NAME_CASE(RDPKRU)
30969 NODE_NAME_CASE(WRPKRU)
30970 NODE_NAME_CASE(VPMADDUBSW)
30971 NODE_NAME_CASE(VPMADDWD)
30972 NODE_NAME_CASE(VPSHA)
30973 NODE_NAME_CASE(VPSHL)
30974 NODE_NAME_CASE(VPCOM)
30975 NODE_NAME_CASE(VPCOMU)
30976 NODE_NAME_CASE(VPERMIL2)
30977 NODE_NAME_CASE(FMSUB)
30978 NODE_NAME_CASE(STRICT_FMSUB)
30979 NODE_NAME_CASE(FNMADD)
30980 NODE_NAME_CASE(STRICT_FNMADD)
30981 NODE_NAME_CASE(FNMSUB)
30982 NODE_NAME_CASE(STRICT_FNMSUB)
30983 NODE_NAME_CASE(FMADDSUB)
30984 NODE_NAME_CASE(FMSUBADD)
30985 NODE_NAME_CASE(FMADD_RND)
30986 NODE_NAME_CASE(FNMADD_RND)
30987 NODE_NAME_CASE(FMSUB_RND)
30988 NODE_NAME_CASE(FNMSUB_RND)
30989 NODE_NAME_CASE(FMADDSUB_RND)
30990 NODE_NAME_CASE(FMSUBADD_RND)
30991 NODE_NAME_CASE(VPMADD52H)
30992 NODE_NAME_CASE(VPMADD52L)
30993 NODE_NAME_CASE(VRNDSCALE)
30994 NODE_NAME_CASE(STRICT_VRNDSCALE)
30995 NODE_NAME_CASE(VRNDSCALE_SAE)
30996 NODE_NAME_CASE(VRNDSCALES)
30997 NODE_NAME_CASE(VRNDSCALES_SAE)
30998 NODE_NAME_CASE(VREDUCE)
30999 NODE_NAME_CASE(VREDUCE_SAE)
31000 NODE_NAME_CASE(VREDUCES)
31001 NODE_NAME_CASE(VREDUCES_SAE)
31002 NODE_NAME_CASE(VGETMANT)
31003 NODE_NAME_CASE(VGETMANT_SAE)
31004 NODE_NAME_CASE(VGETMANTS)
31005 NODE_NAME_CASE(VGETMANTS_SAE)
31006 NODE_NAME_CASE(PCMPESTR)
31007 NODE_NAME_CASE(PCMPISTR)
31008 NODE_NAME_CASE(XTEST)
31009 NODE_NAME_CASE(COMPRESS)
31010 NODE_NAME_CASE(EXPAND)
31011 NODE_NAME_CASE(SELECTS)
31012 NODE_NAME_CASE(ADDSUB)
31013 NODE_NAME_CASE(RCP14)
31014 NODE_NAME_CASE(RCP14S)
31015 NODE_NAME_CASE(RCP28)
31016 NODE_NAME_CASE(RCP28_SAE)
31017 NODE_NAME_CASE(RCP28S)
31018 NODE_NAME_CASE(RCP28S_SAE)
31019 NODE_NAME_CASE(EXP2)
31020 NODE_NAME_CASE(EXP2_SAE)
31021 NODE_NAME_CASE(RSQRT14)
31022 NODE_NAME_CASE(RSQRT14S)
31023 NODE_NAME_CASE(RSQRT28)
31024 NODE_NAME_CASE(RSQRT28_SAE)
31025 NODE_NAME_CASE(RSQRT28S)
31026 NODE_NAME_CASE(RSQRT28S_SAE)
31027 NODE_NAME_CASE(FADD_RND)
31028 NODE_NAME_CASE(FADDS)
31029 NODE_NAME_CASE(FADDS_RND)
31030 NODE_NAME_CASE(FSUB_RND)
31031 NODE_NAME_CASE(FSUBS)
31032 NODE_NAME_CASE(FSUBS_RND)
31033 NODE_NAME_CASE(FMUL_RND)
31034 NODE_NAME_CASE(FMULS)
31035 NODE_NAME_CASE(FMULS_RND)
31036 NODE_NAME_CASE(FDIV_RND)
31037 NODE_NAME_CASE(FDIVS)
31038 NODE_NAME_CASE(FDIVS_RND)
31039 NODE_NAME_CASE(FSQRT_RND)
31040 NODE_NAME_CASE(FSQRTS)
31041 NODE_NAME_CASE(FSQRTS_RND)
31042 NODE_NAME_CASE(FGETEXP)
31043 NODE_NAME_CASE(FGETEXP_SAE)
31044 NODE_NAME_CASE(FGETEXPS)
31045 NODE_NAME_CASE(FGETEXPS_SAE)
31046 NODE_NAME_CASE(SCALEF)
31047 NODE_NAME_CASE(SCALEF_RND)
31048 NODE_NAME_CASE(SCALEFS)
31049 NODE_NAME_CASE(SCALEFS_RND)
31050 NODE_NAME_CASE(AVG)
31051 NODE_NAME_CASE(MULHRS)
31052 NODE_NAME_CASE(SINT_TO_FP_RND)
31053 NODE_NAME_CASE(UINT_TO_FP_RND)
31054 NODE_NAME_CASE(CVTTP2SI)
31055 NODE_NAME_CASE(CVTTP2UI)
31056 NODE_NAME_CASE(STRICT_CVTTP2SI)
31057 NODE_NAME_CASE(STRICT_CVTTP2UI)
31058 NODE_NAME_CASE(MCVTTP2SI)
31059 NODE_NAME_CASE(MCVTTP2UI)
31060 NODE_NAME_CASE(CVTTP2SI_SAE)
31061 NODE_NAME_CASE(CVTTP2UI_SAE)
31062 NODE_NAME_CASE(CVTTS2SI)
31063 NODE_NAME_CASE(CVTTS2UI)
31064 NODE_NAME_CASE(CVTTS2SI_SAE)
31065 NODE_NAME_CASE(CVTTS2UI_SAE)
31066 NODE_NAME_CASE(CVTSI2P)
31067 NODE_NAME_CASE(CVTUI2P)
31068 NODE_NAME_CASE(STRICT_CVTSI2P)
31069 NODE_NAME_CASE(STRICT_CVTUI2P)
31070 NODE_NAME_CASE(MCVTSI2P)
31071 NODE_NAME_CASE(MCVTUI2P)
31072 NODE_NAME_CASE(VFPCLASS)
31073 NODE_NAME_CASE(VFPCLASSS)
31074 NODE_NAME_CASE(MULTISHIFT)
31075 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31076 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31077 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31078 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31079 NODE_NAME_CASE(CVTPS2PH)
31080 NODE_NAME_CASE(STRICT_CVTPS2PH)
31081 NODE_NAME_CASE(MCVTPS2PH)
31082 NODE_NAME_CASE(CVTPH2PS)
31083 NODE_NAME_CASE(STRICT_CVTPH2PS)
31084 NODE_NAME_CASE(CVTPH2PS_SAE)
31085 NODE_NAME_CASE(CVTP2SI)
31086 NODE_NAME_CASE(CVTP2UI)
31087 NODE_NAME_CASE(MCVTP2SI)
31088 NODE_NAME_CASE(MCVTP2UI)
31089 NODE_NAME_CASE(CVTP2SI_RND)
31090 NODE_NAME_CASE(CVTP2UI_RND)
31091 NODE_NAME_CASE(CVTS2SI)
31092 NODE_NAME_CASE(CVTS2UI)
31093 NODE_NAME_CASE(CVTS2SI_RND)
31094 NODE_NAME_CASE(CVTS2UI_RND)
31095 NODE_NAME_CASE(CVTNE2PS2BF16)
31096 NODE_NAME_CASE(CVTNEPS2BF16)
31097 NODE_NAME_CASE(MCVTNEPS2BF16)
31098 NODE_NAME_CASE(DPBF16PS)
31099 NODE_NAME_CASE(LWPINS)
31100 NODE_NAME_CASE(MGATHER)
31101 NODE_NAME_CASE(MSCATTER)
31102 NODE_NAME_CASE(VPDPBUSD)
31103 NODE_NAME_CASE(VPDPBUSDS)
31104 NODE_NAME_CASE(VPDPWSSD)
31105 NODE_NAME_CASE(VPDPWSSDS)
31106 NODE_NAME_CASE(VPSHUFBITQMB)
31107 NODE_NAME_CASE(GF2P8MULB)
31108 NODE_NAME_CASE(GF2P8AFFINEQB)
31109 NODE_NAME_CASE(GF2P8AFFINEINVQB)
31110 NODE_NAME_CASE(NT_CALL)
31111 NODE_NAME_CASE(NT_BRIND)
31112 NODE_NAME_CASE(UMWAIT)
31113 NODE_NAME_CASE(TPAUSE)
31114 NODE_NAME_CASE(ENQCMD)
31115 NODE_NAME_CASE(ENQCMDS)
31116 NODE_NAME_CASE(VP2INTERSECT)
31117 NODE_NAME_CASE(AESENC128KL)
31118 NODE_NAME_CASE(AESDEC128KL)
31119 NODE_NAME_CASE(AESENC256KL)
31120 NODE_NAME_CASE(AESDEC256KL)
31121 NODE_NAME_CASE(AESENCWIDE128KL)
31122 NODE_NAME_CASE(AESDECWIDE128KL)
31123 NODE_NAME_CASE(AESENCWIDE256KL)
31124 NODE_NAME_CASE(AESDECWIDE256KL)
31125 NODE_NAME_CASE(TESTUI)
31126 }
31127 return nullptr;
31128#undef NODE_NAME_CASE
31129}
31130
31131/// Return true if the addressing mode represented by AM is legal for this
31132/// target, for a load/store of the specified type.
31133bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31134 const AddrMode &AM, Type *Ty,
31135 unsigned AS,
31136 Instruction *I) const {
31137 // X86 supports extremely general addressing modes.
31138 CodeModel::Model M = getTargetMachine().getCodeModel();
31139
31140 // X86 allows a sign-extended 32-bit immediate field as a displacement.
31141 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31142 return false;
31143
31144 if (AM.BaseGV) {
31145 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31146
31147 // If a reference to this global requires an extra load, we can't fold it.
31148 if (isGlobalStubReference(GVFlags))
31149 return false;
31150
31151 // If BaseGV requires a register for the PIC base, we cannot also have a
31152 // BaseReg specified.
31153 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31154 return false;
31155
31156 // If lower 4G is not available, then we must use rip-relative addressing.
31157 if ((M != CodeModel::Small || isPositionIndependent()) &&
31158 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31159 return false;
31160 }
31161
31162 switch (AM.Scale) {
31163 case 0:
31164 case 1:
31165 case 2:
31166 case 4:
31167 case 8:
31168 // These scales always work.
31169 break;
31170 case 3:
31171 case 5:
31172 case 9:
31173 // These scales are formed with basereg+scalereg. Only accept if there is
31174 // no basereg yet.
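// Illustrative: a scale of 5 is encodable only as base+index*4 with the same
// register in both roles, e.g. lea (%rax,%rax,4), so it consumes the base
// register slot and cannot be combined with another base.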
31175 if (AM.HasBaseReg)
31176 return false;
31177 break;
31178 default: // Other stuff never works.
31179 return false;
31180 }
31181
31182 return true;
31183}
31184
31185bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31186 unsigned Bits = Ty->getScalarSizeInBits();
31187
31188 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
31189 // particularly cheaper than those without.
31190 if (Bits == 8)
31191 return false;
31192
31193 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31194 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31195 if (Subtarget.hasXOP() &&
31196 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31197 return false;
31198
31199 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31200 // shifts just as cheap as scalar ones.
31201 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31202 return false;
31203
31204 // AVX512BW has shifts such as vpsllvw.
31205 if (Subtarget.hasBWI() && Bits == 16)
31206 return false;
31207
31208 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31209 // fully general vector.
31210 return true;
31211}
31212
31213bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31214 switch (Opcode) {
31215 // These are non-commutative binops.
31216 // TODO: Add more X86ISD opcodes once we have test coverage.
31217 case X86ISD::ANDNP:
31218 case X86ISD::PCMPGT:
31219 case X86ISD::FMAX:
31220 case X86ISD::FMIN:
31221 case X86ISD::FANDN:
31222 return true;
31223 }
31224
31225 return TargetLoweringBase::isBinOp(Opcode);
31226}
31227
31228bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31229 switch (Opcode) {
31230 // TODO: Add more X86ISD opcodes once we have test coverage.
31231 case X86ISD::PCMPEQ:
31232 case X86ISD::PMULDQ:
31233 case X86ISD::PMULUDQ:
31234 case X86ISD::FMAXC:
31235 case X86ISD::FMINC:
31236 case X86ISD::FAND:
31237 case X86ISD::FOR:
31238 case X86ISD::FXOR:
31239 return true;
31240 }
31241
31242 return TargetLoweringBase::isCommutativeBinOp(Opcode);
31243}
31244
31245bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31246 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31247 return false;
31248 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31249 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31250 return NumBits1 > NumBits2;
31251}
31252
31253bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31254 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31255 return false;
31256
31257 if (!isTypeLegal(EVT::getEVT(Ty1)))
31258 return false;
31259
31260 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31261
31262 // Assuming the caller doesn't have a zeroext or signext return parameter,
31263 // truncation all the way down to i1 is valid.
31264 return true;
31265}
31266
31267bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
31268 return isInt<32>(Imm);
31269}
31270
31271bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
31272 // Can also use sub to handle negated immediates.
31273 return isInt<32>(Imm);
31274}
31275
31276bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
31277 return isInt<32>(Imm);
31278}
31279
31280bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
31281 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
31282 return false;
31283 unsigned NumBits1 = VT1.getSizeInBits();
31284 unsigned NumBits2 = VT2.getSizeInBits();
31285 return NumBits1 > NumBits2;
31286}
31287
31288bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
31289 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31290 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
31291}
31292
31293bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
31294 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
31295 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
31296}
31297
31298bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
31299 EVT VT1 = Val.getValueType();
31300 if (isZExtFree(VT1, VT2))
31301 return true;
31302
31303 if (Val.getOpcode() != ISD::LOAD)
31304 return false;
31305
31306 if (!VT1.isSimple() || !VT1.isInteger() ||
31307 !VT2.isSimple() || !VT2.isInteger())
31308 return false;
31309
31310 switch (VT1.getSimpleVT().SimpleTy) {
31311 default: break;
31312 case MVT::i8:
31313 case MVT::i16:
31314 case MVT::i32:
31315 // X86 has 8, 16, and 32-bit zero-extending loads.
31316 return true;
31317 }
31318
31319 return false;
31320}
31321
31322bool X86TargetLowering::shouldSinkOperands(Instruction *I,
31323 SmallVectorImpl<Use *> &Ops) const {
31324 // A uniform shift amount in a vector shift or funnel shift may be much
31325 // cheaper than a generic variable vector shift, so make that pattern visible
31326 // to SDAG by sinking the shuffle instruction next to the shift.
31327 int ShiftAmountOpNum = -1;
31328 if (I->isShift())
31329 ShiftAmountOpNum = 1;
31330 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
31331 if (II->getIntrinsicID() == Intrinsic::fshl ||
31332 II->getIntrinsicID() == Intrinsic::fshr)
31333 ShiftAmountOpNum = 2;
31334 }
31335
31336 if (ShiftAmountOpNum == -1)
31337 return false;
31338
31339 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
31340 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
31341 isVectorShiftByScalarCheap(I->getType())) {
31342 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
31343 return true;
31344 }
31345
31346 return false;
31347}
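
For orientation only (a hedged illustration using SSE2 intrinsics, not tied to this hook's implementation), this is roughly the source-level shape of a "vector shift by a uniform scalar amount" that the sinking above makes visible to ISel:

  #include <immintrin.h>

  // All four lanes are shifted by the same scalar amount. Keeping the splat
  // next to the shift lets ISel pick the cheap shift-by-scalar form
  // (pslld-style) instead of a fully general variable vector shift.
  __m128i shiftLanesBySameAmount(__m128i V, int Amt) {
    return _mm_sll_epi32(V, _mm_cvtsi32_si128(Amt)); // count in low 64 bits
  }
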
31348
31349bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
31350 if (!Subtarget.is64Bit())
31351 return false;
31352 return TargetLowering::shouldConvertPhiType(From, To);
31353}
31354
31355bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
31356 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
31357 return false;
31358
31359 EVT SrcVT = ExtVal.getOperand(0).getValueType();
31360
31361 // There is no extending load for vXi1.
31362 if (SrcVT.getScalarType() == MVT::i1)
31363 return false;
31364
31365 return true;
31366}
31367
31368bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
31369 EVT VT) const {
31370 if (!Subtarget.hasAnyFMA())
31371 return false;
31372
31373 VT = VT.getScalarType();
31374
31375 if (!VT.isSimple())
31376 return false;
31377
31378 switch (VT.getSimpleVT().SimpleTy) {
31379 case MVT::f32:
31380 case MVT::f64:
31381 return true;
31382 default:
31383 break;
31384 }
31385
31386 return false;
31387}
31388
31389bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
31390 // i16 instructions are longer (0x66 prefix) and potentially slower.
31391 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
31392}
31393
31394/// Targets can use this to indicate that they only support *some*
31395/// VECTOR_SHUFFLE operations, those with specific masks.
31396/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
31397/// are assumed to be legal.
31398bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
31399 if (!VT.isSimple())
31400 return false;
31401
31402 // Not for i1 vectors
31403 if (VT.getSimpleVT().getScalarType() == MVT::i1)
31404 return false;
31405
31406 // Very little shuffling can be done for 64-bit vectors right now.
31407 if (VT.getSimpleVT().getSizeInBits() == 64)
31408 return false;
31409
31410 // We only care that the types being shuffled are legal. The lowering can
31411 // handle any possible shuffle mask that results.
31412 return isTypeLegal(VT.getSimpleVT());
31413}
31414
31415bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
31416 EVT VT) const {
31417 // Don't convert an 'and' into a shuffle that we don't directly support.
31418 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
31419 if (!Subtarget.hasAVX2())
31420 if (VT == MVT::v32i8 || VT == MVT::v16i16)
31421 return false;
31422
31423 // Just delegate to the generic legality, clear masks aren't special.
31424 return isShuffleMaskLegal(Mask, VT);
31425}
31426
31427bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
31428 // If the subtarget is using thunks, we need to not generate jump tables.
31429 if (Subtarget.useIndirectThunkBranches())
31430 return false;
31431
31432 // Otherwise, fallback on the generic logic.
31433 return TargetLowering::areJTsAllowed(Fn);
31434}
31435
31436//===----------------------------------------------------------------------===//
31437// X86 Scheduler Hooks
31438//===----------------------------------------------------------------------===//
31439
31440// Returns true if EFLAG is consumed after this iterator in the rest of the
31441// basic block or any successors of the basic block.
31442static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
31443 MachineBasicBlock *BB) {
31444 // Scan forward through BB for a use/def of EFLAGS.
31445 for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
31446 miI != miE; ++miI) {
31447 const MachineInstr& mi = *miI;
31448 if (mi.readsRegister(X86::EFLAGS))
31449 return true;
31450 // If we found a def, we can stop searching.
31451 if (mi.definesRegister(X86::EFLAGS))
31452 return false;
31453 }
31454
31455 // If we hit the end of the block, check whether EFLAGS is live into a
31456 // successor.
31457 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
31458 sEnd = BB->succ_end();
31459 sItr != sEnd; ++sItr) {
31460 MachineBasicBlock* succ = *sItr;
31461 if (succ->isLiveIn(X86::EFLAGS))
31462 return true;
31463 }
31464
31465 return false;
31466}
31467
31468/// Utility function to emit xbegin specifying the start of an RTM region.
31469static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
31470 const TargetInstrInfo *TII) {
31471 const DebugLoc &DL = MI.getDebugLoc();
31472
31473 const BasicBlock *BB = MBB->getBasicBlock();
31474 MachineFunction::iterator I = ++MBB->getIterator();
31475
31476 // For the v = xbegin(), we generate
31477 //
31478 // thisMBB:
31479 // xbegin sinkMBB
31480 //
31481 // mainMBB:
31482 // s0 = -1
31483 //
31484 // fallBB:
31485 // eax = # XABORT_DEF
31486 // s1 = eax
31487 //
31488 // sinkMBB:
31489 // v = phi(s0/mainBB, s1/fallBB)
31490
31491 MachineBasicBlock *thisMBB = MBB;
31492 MachineFunction *MF = MBB->getParent();
31493 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
31494 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
31495 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
31496 MF->insert(I, mainMBB);
31497 MF->insert(I, fallMBB);
31498 MF->insert(I, sinkMBB);
31499
31500 if (isEFLAGSLiveAfter(MI, MBB)) {
31501 mainMBB->addLiveIn(X86::EFLAGS);
31502 fallMBB->addLiveIn(X86::EFLAGS);
31503 sinkMBB->addLiveIn(X86::EFLAGS);
31504 }
31505
31506 // Transfer the remainder of BB and its successor edges to sinkMBB.
31507 sinkMBB->splice(sinkMBB->begin(), MBB,
31508 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
31509 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
31510
31511 MachineRegisterInfo &MRI = MF->getRegInfo();
31512 Register DstReg = MI.getOperand(0).getReg();
31513 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
31514 Register mainDstReg = MRI.createVirtualRegister(RC);
31515 Register fallDstReg = MRI.createVirtualRegister(RC);
31516
31517 // thisMBB:
31518 // xbegin fallMBB
31519 // # fallthrough to mainMBB
31520 // # abort to fallMBB
31521 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
31522 thisMBB->addSuccessor(mainMBB);
31523 thisMBB->addSuccessor(fallMBB);
31524
31525 // mainMBB:
31526 // mainDstReg := -1
31527 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
31528 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
31529 mainMBB->addSuccessor(sinkMBB);
31530
31531 // fallMBB:
31532 // ; pseudo instruction to model hardware's definition from XABORT
31533 // EAX := XABORT_DEF
31534 // fallDstReg := EAX
31535 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
31536 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
31537 .addReg(X86::EAX);
31538 fallMBB->addSuccessor(sinkMBB);
31539
31540 // sinkMBB:
31541 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
31542 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
31543 .addReg(mainDstReg).addMBB(mainMBB)
31544 .addReg(fallDstReg).addMBB(fallMBB);
31545
31546 MI.eraseFromParent();
31547 return sinkMBB;
31548}
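
For reference (a hedged sketch using the RTM intrinsics, not the analyzed code), the CFG built above corresponds to the usual source-level xbegin idiom; tryTransaction is an illustrative name:

  #include <immintrin.h>   // compile with RTM enabled, e.g. -mrtm

  // mainMBB corresponds to the fall-through where _xbegin() returned
  // _XBEGIN_STARTED (-1); fallMBB corresponds to resuming at the xbegin
  // target after an abort, with the abort status materialized in EAX.
  unsigned tryTransaction(int &Shared) {
    unsigned Status = _xbegin();
    if (Status == _XBEGIN_STARTED) {
      ++Shared;            // transactional body
      _xend();
    }
    return Status;
  }
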
31549
31550
31551
31552MachineBasicBlock *
31553X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
31554 MachineBasicBlock *MBB) const {
31555 // Emit va_arg instruction on X86-64.
31556
31557 // Operands to this pseudo-instruction:
31558 // 0 ) Output : destination address (reg)
31559 // 1-5) Input : va_list address (addr, i64mem)
31560 // 6 ) ArgSize : Size (in bytes) of vararg type
31561 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
31562 // 8 ) Align : Alignment of type
31563 // 9 ) EFLAGS (implicit-def)
31564
31565 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
31566 static_assert(X86::AddrNumOperands == 5,
31567 "VAARG_64 assumes 5 address operands");
31568
31569 Register DestReg = MI.getOperand(0).getReg();
31570 MachineOperand &Base = MI.getOperand(1);
31571 MachineOperand &Scale = MI.getOperand(2);
31572 MachineOperand &Index = MI.getOperand(3);
31573 MachineOperand &Disp = MI.getOperand(4);
31574 MachineOperand &Segment = MI.getOperand(5);
31575 unsigned ArgSize = MI.getOperand(6).getImm();
31576 unsigned ArgMode = MI.getOperand(7).getImm();
31577 Align Alignment = Align(MI.getOperand(8).getImm());
31578
31579 MachineFunction *MF = MBB->getParent();
31580
31581 // Memory Reference
31582 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
31583
31584 MachineMemOperand *OldMMO = MI.memoperands().front();
31585
31586 // Clone the MMO into two separate MMOs for loading and storing
31587 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
31588 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
31589 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
31590 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
31591
31592 // Machine Information
31593 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31594 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
31595 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
31596 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
31597 const DebugLoc &DL = MI.getDebugLoc();
31598
31599 // struct va_list {
31600 // i32 gp_offset
31601 // i32 fp_offset
31602 // i64 overflow_area (address)
31603 // i64 reg_save_area (address)
31604 // }
31605 // sizeof(va_list) = 24
31606 // alignment(va_list) = 8
31607
31608 unsigned TotalNumIntRegs = 6;
31609 unsigned TotalNumXMMRegs = 8;
31610 bool UseGPOffset = (ArgMode == 1);
31611 bool UseFPOffset = (ArgMode == 2);
31612 unsigned MaxOffset = TotalNumIntRegs * 8 +
31613 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
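      // Worked example: MaxOffset is 6 * 8 = 48 when pulling a GP argument and
      // 48 + 8 * 16 = 176 when pulling an FP argument, i.e. the usual System V
      // bounds on gp_offset and fp_offset.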
31614
31615 /* Align ArgSize to a multiple of 8 */
31616 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
31617 bool NeedsAlign = (Alignment > 8);
31618
31619 MachineBasicBlock *thisMBB = MBB;
31620 MachineBasicBlock *overflowMBB;
31621 MachineBasicBlock *offsetMBB;
31622 MachineBasicBlock *endMBB;
31623
31624 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
31625 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
31626 unsigned OffsetReg = 0;
31627
31628 if (!UseGPOffset && !UseFPOffset) {
31629 // If we only pull from the overflow region, we don't create a branch.
31630 // We don't need to alter control flow.
31631 OffsetDestReg = 0; // unused
31632 OverflowDestReg = DestReg;
31633
31634 offsetMBB = nullptr;
31635 overflowMBB = thisMBB;
31636 endMBB = thisMBB;
31637 } else {
31638 // First emit code to check if gp_offset (or fp_offset) is below the bound.
31639 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
31640 // If not, pull from overflow_area. (branch to overflowMBB)
31641 //
31642 //       thisMBB
31643 //         |     .
31644 //         |        .
31645 //         offsetMBB   overflowMBB
31646 //         |        .
31647 //         |     .
31648 //              endMBB
31649
31650 // Registers for the PHI in endMBB
31651 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
31652 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
31653
31654 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
31655 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31656 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31657 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31658
31659 MachineFunction::iterator MBBIter = ++MBB->getIterator();
31660
31661 // Insert the new basic blocks
31662 MF->insert(MBBIter, offsetMBB);
31663 MF->insert(MBBIter, overflowMBB);
31664 MF->insert(MBBIter, endMBB);
31665
31666 // Transfer the remainder of MBB and its successor edges to endMBB.
31667 endMBB->splice(endMBB->begin(), thisMBB,
31668 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
31669 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
31670
31671 // Make offsetMBB and overflowMBB successors of thisMBB
31672 thisMBB->addSuccessor(offsetMBB);
31673 thisMBB->addSuccessor(overflowMBB);
31674
31675 // endMBB is a successor of both offsetMBB and overflowMBB
31676 offsetMBB->addSuccessor(endMBB);
31677 overflowMBB->addSuccessor(endMBB);
31678
31679 // Load the offset value into a register
31680 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
31681 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
31682 .add(Base)
31683 .add(Scale)
31684 .add(Index)
31685 .addDisp(Disp, UseFPOffset ? 4 : 0)
31686 .add(Segment)
31687 .setMemRefs(LoadOnlyMMO);
31688
31689 // Check if there is enough room left to pull this argument.
31690 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
31691 .addReg(OffsetReg)
31692 .addImm(MaxOffset + 8 - ArgSizeA8);
31693
31694 // Branch to "overflowMBB" if offset >= max
31695 // Fall through to "offsetMBB" otherwise
31696 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
31697 .addMBB(overflowMBB).addImm(X86::COND_AE);
31698 }
31699
31700 // In offsetMBB, emit code to use the reg_save_area.
31701 if (offsetMBB) {
31702 assert(OffsetReg != 0);
31703
31704 // Read the reg_save_area address.
31705 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
31706 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
31707 .add(Base)
31708 .add(Scale)
31709 .add(Index)
31710 .addDisp(Disp, 16)
31711 .add(Segment)
31712 .setMemRefs(LoadOnlyMMO);
31713
31714 // Zero-extend the offset
31715 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
31716 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
31717 .addImm(0)
31718 .addReg(OffsetReg)
31719 .addImm(X86::sub_32bit);
31720
31721 // Add the offset to the reg_save_area to get the final address.
31722 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
31723 .addReg(OffsetReg64)
31724 .addReg(RegSaveReg);
31725
31726 // Compute the offset for the next argument
31727 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
31728 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
31729 .addReg(OffsetReg)
31730 .addImm(UseFPOffset ? 16 : 8);
31731
31732 // Store it back into the va_list.
31733 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
31734 .add(Base)
31735 .add(Scale)
31736 .add(Index)
31737 .addDisp(Disp, UseFPOffset ? 4 : 0)
31738 .add(Segment)
31739 .addReg(NextOffsetReg)
31740 .setMemRefs(StoreOnlyMMO);
31741
31742 // Jump to endMBB
31743 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
31744 .addMBB(endMBB);
31745 }
31746
31747 //
31748 // Emit code to use overflow area
31749 //
31750
31751 // Load the overflow_area address into a register.
31752 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
31753 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
31754 .add(Base)
31755 .add(Scale)
31756 .add(Index)
31757 .addDisp(Disp, 8)
31758 .add(Segment)
31759 .setMemRefs(LoadOnlyMMO);
31760
31761 // If we need to align it, do so. Otherwise, just copy the address
31762 // to OverflowDestReg.
31763 if (NeedsAlign) {
31764 // Align the overflow address
31765 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
31766
31767 // aligned_addr = (addr + (align-1)) & ~(align-1)
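    // (worked example: with align = 32, addr = 0x1004 rounds to
    //  (0x1004 + 31) & ~31 = 0x1020, the next 32-byte boundary)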
31768 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
31769 .addReg(OverflowAddrReg)
31770 .addImm(Alignment.value() - 1);
31771
31772 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
31773 .addReg(TmpReg)
31774 .addImm(~(uint64_t)(Alignment.value() - 1));
31775 } else {
31776 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
31777 .addReg(OverflowAddrReg);
31778 }
31779
31780 // Compute the next overflow address after this argument.
31781 // (the overflow address should be kept 8-byte aligned)
31782 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
31783 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
31784 .addReg(OverflowDestReg)
31785 .addImm(ArgSizeA8);
31786
31787 // Store the new overflow address.
31788 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
31789 .add(Base)
31790 .add(Scale)
31791 .add(Index)
31792 .addDisp(Disp, 8)
31793 .add(Segment)
31794 .addReg(NextAddrReg)
31795 .setMemRefs(StoreOnlyMMO);
31796
31797 // If we branched, emit the PHI to the front of endMBB.
31798 if (offsetMBB) {
31799 BuildMI(*endMBB, endMBB->begin(), DL,
31800 TII->get(X86::PHI), DestReg)
31801 .addReg(OffsetDestReg).addMBB(offsetMBB)
31802 .addReg(OverflowDestReg).addMBB(overflowMBB);
31803 }
31804
31805 // Erase the pseudo instruction
31806 MI.eraseFromParent();
31807
31808 return endMBB;
31809}
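
As a reading aid only — a minimal C++ paraphrase of the branchy logic the blocks above implement, under the System V va_list layout documented in the comment above and ignoring the NeedsAlign realignment of the overflow area; VaList64 and vaArgAddr are illustrative names:

  #include <cstdint>

  struct VaList64 {                 // layout from the comment above
    uint32_t gp_offset;
    uint32_t fp_offset;
    void *overflow_arg_area;
    void *reg_save_area;
  };

  // UseFP corresponds to ArgMode == 2, the plain GP case to ArgMode == 1.
  void *vaArgAddr(VaList64 &VL, unsigned ArgSize, bool UseFP) {
    unsigned ArgSizeA8 = (ArgSize + 7) & ~7u;
    unsigned MaxOffset = 6 * 8 + (UseFP ? 8 * 16 : 0);
    uint32_t &Off = UseFP ? VL.fp_offset : VL.gp_offset;
    if (Off < MaxOffset + 8 - ArgSizeA8) {               // offsetMBB path
      void *Addr = (char *)VL.reg_save_area + Off;
      Off += UseFP ? 16 : 8;
      return Addr;
    }
    void *Addr = VL.overflow_arg_area;                   // overflowMBB path
    VL.overflow_arg_area = (char *)Addr + ArgSizeA8;
    return Addr;
  }
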
31810
31811MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
31812 MachineInstr &MI, MachineBasicBlock *MBB) const {
31813 // Emit code to save XMM registers to the stack. The ABI says that the
31814 // number of registers to save is given in %al, so it's theoretically
31815 // possible to do an indirect jump trick to avoid saving all of them;
31816 // however, this code takes a simpler approach and just executes all
31817 // of the stores if %al is non-zero. It's less code, and it's probably
31818 // easier on the hardware branch predictor, and stores aren't all that
31819 // expensive anyway.
31820
31821 // Create the new basic blocks. One block contains all the XMM stores,
31822 // and one block is the final destination regardless of whether any
31823 // stores were performed.
31824 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
31825 MachineFunction *F = MBB->getParent();
31826 MachineFunction::iterator MBBIter = ++MBB->getIterator();
31827 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
31828 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
31829 F->insert(MBBIter, XMMSaveMBB);
31830 F->insert(MBBIter, EndMBB);
31831
31832 // Transfer the remainder of MBB and its successor edges to EndMBB.
31833 EndMBB->splice(EndMBB->begin(), MBB,
31834 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
31835 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
31836
31837 // The original block will now fall through to the XMM save block.
31838 MBB->addSuccessor(XMMSaveMBB);
31839 // The XMMSaveMBB will fall through to the end block.
31840 XMMSaveMBB->addSuccessor(EndMBB);
31841
31842 // Now add the instructions.
31843 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31844 const DebugLoc &DL = MI.getDebugLoc();
31845
31846 Register CountReg = MI.getOperand(0).getReg();
31847 int RegSaveFrameIndex = MI.getOperand(1).getImm();
31848 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
31849
31850 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
31851 // If %al is 0, branch around the XMM save block.
31852 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
31853 BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
31854 MBB->addSuccessor(EndMBB);
31855 }
31856
31857 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
31858 // that was just emitted, but clearly shouldn't be "saved".
31859 assert((MI.getNumOperands() <= 3 ||
31860         !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
31861         MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
31862        "Expected last argument to be EFLAGS");
31863 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
31864 // In the XMM save block, save all the XMM argument registers.
31865 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
31866 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
31867 MachineMemOperand *MMO = F->getMachineMemOperand(
31868 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
31869 MachineMemOperand::MOStore,
31870 /*Size=*/16, Align(16));
31871 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
31872 .addFrameIndex(RegSaveFrameIndex)
31873 .addImm(/*Scale=*/1)
31874 .addReg(/*IndexReg=*/0)
31875 .addImm(/*Disp=*/Offset)
31876 .addReg(/*Segment=*/0)
31877 .addReg(MI.getOperand(i).getReg())
31878 .addMemOperand(MMO);
31879 }
31880
31881 MI.eraseFromParent(); // The pseudo instruction is gone now.
31882
31883 return EndMBB;
31884}
31885
31886// The EFLAGS operand of SelectItr might be missing a kill marker
31887// because there were multiple uses of EFLAGS, and ISel didn't know
31888// which to mark. Figure out whether SelectItr should have had a
31889// kill marker, and set it if it should. Returns the correct kill
31890// marker value.
31891static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
31892 MachineBasicBlock* BB,
31893 const TargetRegisterInfo* TRI) {
31894 if (isEFLAGSLiveAfter(SelectItr, BB))
31895 return false;
31896
31897 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
31898 // out. SelectMI should have a kill flag on EFLAGS.
31899 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
31900 return true;
31901}
31902
31903// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
31904// together with other CMOV pseudo-opcodes into a single basic-block with
31905// conditional jump around it.
31906static bool isCMOVPseudo(MachineInstr &MI) {
31907 switch (MI.getOpcode()) {
31908 case X86::CMOV_FR32:
31909 case X86::CMOV_FR32X:
31910 case X86::CMOV_FR64:
31911 case X86::CMOV_FR64X:
31912 case X86::CMOV_GR8:
31913 case X86::CMOV_GR16:
31914 case X86::CMOV_GR32:
31915 case X86::CMOV_RFP32:
31916 case X86::CMOV_RFP64:
31917 case X86::CMOV_RFP80:
31918 case X86::CMOV_VR64:
31919 case X86::CMOV_VR128:
31920 case X86::CMOV_VR128X:
31921 case X86::CMOV_VR256:
31922 case X86::CMOV_VR256X:
31923 case X86::CMOV_VR512:
31924 case X86::CMOV_VK1:
31925 case X86::CMOV_VK2:
31926 case X86::CMOV_VK4:
31927 case X86::CMOV_VK8:
31928 case X86::CMOV_VK16:
31929 case X86::CMOV_VK32:
31930 case X86::CMOV_VK64:
31931 return true;
31932
31933 default:
31934 return false;
31935 }
31936}
31937
31938// Helper function, which inserts PHI functions into SinkMBB:
31939// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
31940// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
31941// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
31942// the last PHI function inserted.
31943static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
31944 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
31945 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
31946 MachineBasicBlock *SinkMBB) {
31947 MachineFunction *MF = TrueMBB->getParent();
31948 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
31949 DebugLoc DL = MIItBegin->getDebugLoc();
31950
31951 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
31952 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
31953
31954 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
31955
31956 // As we are creating the PHIs, we have to be careful if there is more than
31957 // one. Later CMOVs may reference the results of earlier CMOVs, but later
31958 // PHIs have to reference the individual true/false inputs from earlier PHIs.
31959 // That also means that PHI construction must work forward from earlier to
31960 // later, and that the code must maintain a mapping from each earlier PHI's
31961 // destination register to the registers that went into that PHI.
31962 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
31963 MachineInstrBuilder MIB;
31964
31965 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
31966 Register DestReg = MIIt->getOperand(0).getReg();
31967 Register Op1Reg = MIIt->getOperand(1).getReg();
31968 Register Op2Reg = MIIt->getOperand(2).getReg();
31969
31970 // If this CMOV we are generating is the opposite condition from
31971 // the jump we generated, then we have to swap the operands for the
31972 // PHI that is going to be generated.
31973 if (MIIt->getOperand(3).getImm() == OppCC)
31974 std::swap(Op1Reg, Op2Reg);
31975
31976 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
31977 Op1Reg = RegRewriteTable[Op1Reg].first;
31978
31979 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
31980 Op2Reg = RegRewriteTable[Op2Reg].second;
31981
31982 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
31983 .addReg(Op1Reg)
31984 .addMBB(FalseMBB)
31985 .addReg(Op2Reg)
31986 .addMBB(TrueMBB);
31987
31988 // Add this PHI to the rewrite table.
31989 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
31990 }
31991
31992 return MIB;
31993}
31994
31995 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
31996MachineBasicBlock *
31997X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
31998 MachineInstr &SecondCascadedCMOV,
31999 MachineBasicBlock *ThisMBB) const {
32000 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32001 DebugLoc DL = FirstCMOV.getDebugLoc();
32002
32003 // We lower cascaded CMOVs such as
32004 //
32005 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32006 //
32007 // to two successive branches.
32008 //
32009 // Without this, we would add a PHI between the two jumps, which ends up
32010 // creating a few copies all around. For instance, for
32011 //
32012 // (sitofp (zext (fcmp une)))
32013 //
32014 // we would generate:
32015 //
32016 // ucomiss %xmm1, %xmm0
32017 // movss <1.0f>, %xmm0
32018 // movaps %xmm0, %xmm1
32019 // jne .LBB5_2
32020 // xorps %xmm1, %xmm1
32021 // .LBB5_2:
32022 // jp .LBB5_4
32023 // movaps %xmm1, %xmm0
32024 // .LBB5_4:
32025 // retq
32026 //
32027 // because this custom-inserter would have generated:
32028 //
32029 // A
32030 // | \
32031 // | B
32032 // | /
32033 // C
32034 // | \
32035 // | D
32036 // | /
32037 // E
32038 //
32039 // A: X = ...; Y = ...
32040 // B: empty
32041 // C: Z = PHI [X, A], [Y, B]
32042 // D: empty
32043 // E: PHI [X, C], [Z, D]
32044 //
32045 // If we lower both CMOVs in a single step, we can instead generate:
32046 //
32047 // A
32048 // | \
32049 // | C
32050 // | /|
32051 // |/ |
32052 // | |
32053 // | D
32054 // | /
32055 // E
32056 //
32057 // A: X = ...; Y = ...
32058 // D: empty
32059 // E: PHI [X, A], [X, C], [Y, D]
32060 //
32061 // Which, in our sitofp/fcmp example, gives us something like:
32062 //
32063 // ucomiss %xmm1, %xmm0
32064 // movss <1.0f>, %xmm0
32065 // jne .LBB5_4
32066 // jp .LBB5_4
32067 // xorps %xmm0, %xmm0
32068 // .LBB5_4:
32069 // retq
32070 //
32071
32072 // We lower cascaded CMOV into two successive branches to the same block.
32073 // EFLAGS is used by both, so mark it as live in the second.
32074 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32075 MachineFunction *F = ThisMBB->getParent();
32076 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32077 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32078 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32079
32080 MachineFunction::iterator It = ++ThisMBB->getIterator();
32081 F->insert(It, FirstInsertedMBB);
32082 F->insert(It, SecondInsertedMBB);
32083 F->insert(It, SinkMBB);
32084
32085 // For a cascaded CMOV, we lower it to two successive branches to
32086 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
32087 // the FirstInsertedMBB.
32088 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32089
32090 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32091 // live into the sink and copy blocks.
32092 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32093 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32094 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32095 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32096 SinkMBB->addLiveIn(X86::EFLAGS);
32097 }
32098
32099 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32100 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32101 std::next(MachineBasicBlock::iterator(FirstCMOV)),
32102 ThisMBB->end());
32103 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32104
32105 // Fallthrough block for ThisMBB.
32106 ThisMBB->addSuccessor(FirstInsertedMBB);
32107 // The true block target of the first branch is always SinkMBB.
32108 ThisMBB->addSuccessor(SinkMBB);
32109 // Fallthrough block for FirstInsertedMBB.
32110 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32111 // The true block for the branch of FirstInsertedMBB.
32112 FirstInsertedMBB->addSuccessor(SinkMBB);
32113 // This is fallthrough.
32114 SecondInsertedMBB->addSuccessor(SinkMBB);
32115
32116 // Create the conditional branch instructions.
32117 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32118 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32119
32120 X86::CondCode SecondCC =
32121 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32122 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32123
32124 // SinkMBB:
32125 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32126 Register DestReg = FirstCMOV.getOperand(0).getReg();
32127 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32128 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32129 MachineInstrBuilder MIB =
32130 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32131 .addReg(Op1Reg)
32132 .addMBB(SecondInsertedMBB)
32133 .addReg(Op2Reg)
32134 .addMBB(ThisMBB);
32135
32136 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
32137 // (the True operand of the SELECT_CC/CMOV nodes).
32138 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32139 // Copy the PHI result to the register defined by the second CMOV.
32140 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32141 TII->get(TargetOpcode::COPY),
32142 SecondCascadedCMOV.getOperand(0).getReg())
32143 .addReg(FirstCMOV.getOperand(0).getReg());
32144
32145 // Now remove the CMOVs.
32146 FirstCMOV.eraseFromParent();
32147 SecondCascadedCMOV.eraseFromParent();
32148
32149 return SinkMBB;
32150}
32151
32152MachineBasicBlock *
32153X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32154 MachineBasicBlock *ThisMBB) const {
32155 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32156 const DebugLoc &DL = MI.getDebugLoc();
32157
32158 // To "insert" a SELECT_CC instruction, we actually have to insert the
32159 // diamond control-flow pattern. The incoming instruction knows the
32160 // destination vreg to set, the condition code register to branch on, the
32161 // true/false values to select between and a branch opcode to use.
32162
32163 // ThisMBB:
32164 // ...
32165 // TrueVal = ...
32166 // cmpTY ccX, r1, r2
32167 // bCC copy1MBB
32168 // fallthrough --> FalseMBB
32169
32170 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32171 // as described above, by inserting a BB, and then making a PHI at the join
32172 // point to select the true and false operands of the CMOV in the PHI.
32173 //
32174 // The code also handles two different cases of multiple CMOV opcodes
32175 // in a row.
32176 //
32177 // Case 1:
32178 // In this case, there are multiple CMOVs in a row, all of which are based on
32179 // the same condition setting (or the exact opposite condition setting).
32180 // In this case we can lower all the CMOVs using a single inserted BB, and
32181 // then make a number of PHIs at the join point to model the CMOVs. The only
32182 // trickiness here is that in a case like:
32183 //
32184 // t2 = CMOV cond1 t1, f1
32185 // t3 = CMOV cond1 t2, f2
32186 //
32187 // when rewriting this into PHIs, we have to perform some renaming on the
32188 // temps since you cannot have a PHI operand refer to a PHI result earlier
32189 // in the same block. The "simple" but wrong lowering would be:
32190 //
32191 // t2 = PHI t1(BB1), f1(BB2)
32192 // t3 = PHI t2(BB1), f2(BB2)
32193 //
32194 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32195 // renaming is to note that on the path through BB1, t2 is really just a
32196 // copy of t1, and do that renaming, properly generating:
32197 //
32198 // t2 = PHI t1(BB1), f1(BB2)
32199 // t3 = PHI t1(BB1), f2(BB2)
32200 //
32201 // Case 2:
32202 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32203 // function - EmitLoweredCascadedSelect.
32204
32205 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32206 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32207 MachineInstr *LastCMOV = &MI;
32208 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32209
32210 // Check for case 1, where there are multiple CMOVs with the same condition
32211 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
32212 // number of jumps the most.
32213
32214 if (isCMOVPseudo(MI)) {
32215 // See if we have a string of CMOVs with the same condition. Skip over
32216 // intervening debug insts.
32217 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32218 (NextMIIt->getOperand(3).getImm() == CC ||
32219 NextMIIt->getOperand(3).getImm() == OppCC)) {
32220 LastCMOV = &*NextMIIt;
32221 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32222 }
32223 }
32224
32225 // Check for case 2, but only if we didn't already find case 1,
32226 // as indicated by LastCMOV still pointing at MI.
32227 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32228 NextMIIt->getOpcode() == MI.getOpcode() &&
32229 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32230 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32231 NextMIIt->getOperand(1).isKill()) {
32232 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32233 }
32234
32235 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32236 MachineFunction *F = ThisMBB->getParent();
32237 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32238 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32239
32240 MachineFunction::iterator It = ++ThisMBB->getIterator();
32241 F->insert(It, FalseMBB);
32242 F->insert(It, SinkMBB);
32243
32244 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32245 // live into the sink and copy blocks.
32246 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32247 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32248 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32249 FalseMBB->addLiveIn(X86::EFLAGS);
32250 SinkMBB->addLiveIn(X86::EFLAGS);
32251 }
32252
32253 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32254 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32255 auto DbgIt = MachineBasicBlock::iterator(MI);
32256 while (DbgIt != DbgEnd) {
32257 auto Next = std::next(DbgIt);
32258 if (DbgIt->isDebugInstr())
32259 SinkMBB->push_back(DbgIt->removeFromParent());
32260 DbgIt = Next;
32261 }
32262
32263 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32264 SinkMBB->splice(SinkMBB->end(), ThisMBB,
32265 std::next(MachineBasicBlock::iterator(LastCMOV)),
32266 ThisMBB->end());
32267 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32268
32269 // Fallthrough block for ThisMBB.
32270 ThisMBB->addSuccessor(FalseMBB);
32271 // The true block target of the first (or only) branch is always SinkMBB.
32272 ThisMBB->addSuccessor(SinkMBB);
32273 // Fallthrough block for FalseMBB.
32274 FalseMBB->addSuccessor(SinkMBB);
32275
32276 // Create the conditional branch instruction.
32277 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32278
32279 // SinkMBB:
32280 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32281 // ...
32282 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32283 MachineBasicBlock::iterator MIItEnd =
32284 std::next(MachineBasicBlock::iterator(LastCMOV));
32285 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32286
32287 // Now remove the CMOV(s).
32288 ThisMBB->erase(MIItBegin, MIItEnd);
32289
32290 return SinkMBB;
32291}
32292
32293static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32294 if (IsLP64) {
32295 if (isInt<8>(Imm))
32296 return X86::SUB64ri8;
32297 return X86::SUB64ri32;
32298 } else {
32299 if (isInt<8>(Imm))
32300 return X86::SUB32ri8;
32301 return X86::SUB32ri;
32302 }
32303}
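
A quick worked example of the immediate-width choice above (the values are just illustrations):

  getSUBriOpcode(/*IsLP64=*/true, /*Imm=*/16);   // 16 fits isInt<8>   -> X86::SUB64ri8
  getSUBriOpcode(/*IsLP64=*/true, /*Imm=*/4096); // 4096 needs 32 bits -> X86::SUB64ri32
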
32304
32305MachineBasicBlock *
32306X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32307 MachineBasicBlock *MBB) const {
32308 MachineFunction *MF = MBB->getParent();
32309 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32310 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32311 const DebugLoc &DL = MI.getDebugLoc();
32312 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32313
32314 const unsigned ProbeSize = getStackProbeSize(*MF);
32315
32316 MachineRegisterInfo &MRI = MF->getRegInfo();
32317 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32318 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32319 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32320
32321 MachineFunction::iterator MBBIter = ++MBB->getIterator();
32322 MF->insert(MBBIter, testMBB);
32323 MF->insert(MBBIter, blockMBB);
32324 MF->insert(MBBIter, tailMBB);
32325
32326 Register sizeVReg = MI.getOperand(1).getReg();
32327
32328 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
32329
32330 Register TmpStackPtr = MRI.createVirtualRegister(
32331 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32332 Register FinalStackPtr = MRI.createVirtualRegister(
32333 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
32334
32335 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
32336 .addReg(physSPReg);
32337 {
32338 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
32339 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
32340 .addReg(TmpStackPtr)
32341 .addReg(sizeVReg);
32342 }
32343
32344 // test rsp size
32345
32346 BuildMI(testMBB, DL,
32347 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
32348 .addReg(FinalStackPtr)
32349 .addReg(physSPReg);
32350
32351 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
32352 .addMBB(tailMBB)
32353 .addImm(X86::COND_GE);
32354 testMBB->addSuccessor(blockMBB);
32355 testMBB->addSuccessor(tailMBB);
32356
32357 // Touch the block, then extend it. This is the opposite order from a static
32358 // probe, where we allocate then touch; it avoids the need to probe the
32359 // tail of the static alloca. Possible scenarios are:
32360 //
32361 // + ---- <- ------------ <- ------------- <- ------------ +
32362 // | |
32363 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
32364 // | |
32365 // + <- ----------- <- ------------ <- ----------- <- ------------ +
32366 //
32367 // The property we want to enforce is to never have more than [page alloc] between two probes.
32368
32369 const unsigned XORMIOpc =
32370 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
32371 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
32372 .addImm(0);
32373
32374 BuildMI(blockMBB, DL,
32375 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
32376 .addReg(physSPReg)
32377 .addImm(ProbeSize);
32378
32379
32380 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
32381 blockMBB->addSuccessor(testMBB);
32382
32383 // Replace original instruction by the expected stack ptr
32384 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
32385 .addReg(FinalStackPtr);
32386
32387 tailMBB->splice(tailMBB->end(), MBB,
32388 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32389 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
32390 MBB->addSuccessor(testMBB);
32391
32392 // Delete the original pseudo instruction.
32393 MI.eraseFromParent();
32394
32395 // And we're done.
32396 return tailMBB;
32397}
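
A rough sketch (my own paraphrase, not LLVM code) of the test/block/tail loop assembled above, assuming ProbeSize is the page-probe interval; probedAlloca is an illustrative name:

  #include <cstdint>

  // testMBB compares the target SP with the current SP and exits once the
  // target is reached; blockMBB first touches the current page with an
  // idempotent xor of 0, then moves SP down by one probe interval.
  char *probedAlloca(char *SP, uint64_t Size, uint64_t ProbeSize) {
    char *Final = SP - Size;
    while (SP > Final) {              // testMBB: CMP Final, SP; JGE tailMBB
      *(volatile char *)SP ^= 0;      // blockMBB: XOR{64|32}mi8 $0, (SP)
      SP -= ProbeSize;                // blockMBB: SUB ProbeSize, SP
    }
    return Final;                     // tailMBB: result is FinalStackPtr
  }
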
32398
32399MachineBasicBlock *
32400X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
32401 MachineBasicBlock *BB) const {
32402 MachineFunction *MF = BB->getParent();
32403 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32404 const DebugLoc &DL = MI.getDebugLoc();
32405 const BasicBlock *LLVM_BB = BB->getBasicBlock();
32406
32407 assert(MF->shouldSplitStack());
32408
32409 const bool Is64Bit = Subtarget.is64Bit();
32410 const bool IsLP64 = Subtarget.isTarget64BitLP64();
32411
32412 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
32413 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
32414
32415 // BB:
32416 // ... [Till the alloca]
32417 // If stacklet is not large enough, jump to mallocMBB
32418 //
32419 // bumpMBB:
32420 // Allocate by subtracting from RSP
32421 // Jump to continueMBB
32422 //
32423 // mallocMBB:
32424 // Allocate by call to runtime
32425 //
32426 // continueMBB:
32427 // ...
32428 // [rest of original BB]
32429 //
32430
32431 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32432 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32433 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32434
32435 MachineRegisterInfo &MRI = MF->getRegInfo();
32436 const TargetRegisterClass *AddrRegClass =
32437 getRegClassFor(getPointerTy(MF->getDataLayout()));
32438
32439 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32440 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
32441 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
32442 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
32443 sizeVReg = MI.getOperand(1).getReg(),
32444 physSPReg =
32445 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
32446
32447 MachineFunction::iterator MBBIter = ++BB->getIterator();
32448
32449 MF->insert(MBBIter, bumpMBB);
32450 MF->insert(MBBIter, mallocMBB);
32451 MF->insert(MBBIter, continueMBB);
32452
32453 continueMBB->splice(continueMBB->begin(), BB,
32454 std::next(MachineBasicBlock::iterator(MI)), BB->end());
32455 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
32456
32457 // Add code to the main basic block to check if the stack limit has been hit,
32458 // and if so, jump to mallocMBB otherwise to bumpMBB.
32459 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
32460 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
32461 .addReg(tmpSPVReg).addReg(sizeVReg);
32462 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
32463 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
32464 .addReg(SPLimitVReg);
32465 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
32466
32467 // bumpMBB simply decreases the stack pointer, since we know the current
32468 // stacklet has enough space.
32469 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
32470 .addReg(SPLimitVReg);
32471 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
32472 .addReg(SPLimitVReg);
32473 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32474
32475 // Calls into a routine in libgcc to allocate more space from the heap.
32476 const uint32_t *RegMask =
32477 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
32478 if (IsLP64) {
32479 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
32480 .addReg(sizeVReg);
32481 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32482 .addExternalSymbol("__morestack_allocate_stack_space")
32483 .addRegMask(RegMask)
32484 .addReg(X86::RDI, RegState::Implicit)
32485 .addReg(X86::RAX, RegState::ImplicitDefine);
32486 } else if (Is64Bit) {
32487 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
32488 .addReg(sizeVReg);
32489 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
32490 .addExternalSymbol("__morestack_allocate_stack_space")
32491 .addRegMask(RegMask)
32492 .addReg(X86::EDI, RegState::Implicit)
32493 .addReg(X86::EAX, RegState::ImplicitDefine);
32494 } else {
32495 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
32496 .addImm(12);
32497 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
32498 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
32499 .addExternalSymbol("__morestack_allocate_stack_space")
32500 .addRegMask(RegMask)
32501 .addReg(X86::EAX, RegState::ImplicitDefine);
32502 }
32503
32504 if (!Is64Bit)
32505 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
32506 .addImm(16);
32507
32508 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
32509 .addReg(IsLP64 ? X86::RAX : X86::EAX);
32510 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
32511
32512 // Set up the CFG correctly.
32513 BB->addSuccessor(bumpMBB);
32514 BB->addSuccessor(mallocMBB);
32515 mallocMBB->addSuccessor(continueMBB);
32516 bumpMBB->addSuccessor(continueMBB);
32517
32518 // Take care of the PHI nodes.
32519 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
32520 MI.getOperand(0).getReg())
32521 .addReg(mallocPtrVReg)
32522 .addMBB(mallocMBB)
32523 .addReg(bumpSPPtrVReg)
32524 .addMBB(bumpMBB);
32525
32526 // Delete the original pseudo instruction.
32527 MI.eraseFromParent();
32528
32529 // And we're done.
32530 return continueMBB;
32531}
32532
32533MachineBasicBlock *
32534X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
32535 MachineBasicBlock *BB) const {
32536 MachineFunction *MF = BB->getParent();
32537 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
32538 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
32539 const DebugLoc &DL = MI.getDebugLoc();
32540
32541 assert(!isAsynchronousEHPersonality(
32542            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
32543        "SEH does not use catchret!");
32544
32545 // Only 32-bit EH needs to worry about manually restoring stack pointers.
32546 if (!Subtarget.is32Bit())
32547 return BB;
32548
32549 // C++ EH creates a new target block to hold the restore code, and wires up
32550 // the new block to the return destination with a normal JMP_4.
32551 MachineBasicBlock *RestoreMBB =
32552 MF->CreateMachineBasicBlock(BB->getBasicBlock());
32553 assert(BB->succ_size() == 1);
32554 MF->insert(std::next(BB->getIterator()), RestoreMBB);
32555 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
32556 BB->addSuccessor(RestoreMBB);
32557 MI.getOperand(0).setMBB(RestoreMBB);
32558
32559 // Marking this as an EH pad but not a funclet entry block causes PEI to
32560 // restore stack pointers in the block.
32561 RestoreMBB->setIsEHPad(true);
32562
32563 auto RestoreMBBI = RestoreMBB->begin();
32564 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
32565 return BB;
32566}
32567
32568MachineBasicBlock *
32569X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
32570 MachineBasicBlock *BB) const {
32571 // So, here we replace TLSADDR with the sequence:
32572 // adjust_stackdown -> TLSADDR -> adjust_stackup.
32573 // We need this because TLSADDR is lowered into calls
32574 // inside MC; therefore, without the two markers, shrink-wrapping
32575 // may push the prologue/epilogue past them.
32576 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
32577 const DebugLoc &DL = MI.getDebugLoc();
32578 MachineFunction &MF = *BB->getParent();
32579
32580 // Emit CALLSEQ_START right before the instruction.
32581 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
32582 MachineInstrBuilder CallseqStart =
32583 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
32584 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
32585
32586 // Emit CALLSEQ_END right after the instruction.
32587 // We don't call erase from parent because we want to keep the
32588 // original instruction around.
32589 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
32590 MachineInstrBuilder CallseqEnd =
32591 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
32592 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
32593
32594 return BB;
32595}
32596
32597MachineBasicBlock *
32598X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
32599 MachineBasicBlock *BB) const {
32600 // This is pretty easy. We're taking the value that we received from
32601 // our load from the relocation, sticking it in either RDI (x86-64)
32602 // or EAX and doing an indirect call. The return value will then
32603 // be in the normal return register.
32604 MachineFunction *F = BB->getParent();
32605 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32606 const DebugLoc &DL = MI.getDebugLoc();
32607
32608 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
32609 assert(MI.getOperand(3).isGlobal() && "This should be a global");
32610
32611 // Get a register mask for the lowered call.
32612 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
32613 // proper register mask.
32614 const uint32_t *RegMask =
32615 Subtarget.is64Bit() ?
32616 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
32617 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
32618 if (Subtarget.is64Bit()) {
32619 MachineInstrBuilder MIB =
32620 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
32621 .addReg(X86::RIP)
32622 .addImm(0)
32623 .addReg(0)
32624 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
32625 MI.getOperand(3).getTargetFlags())
32626 .addReg(0);
32627 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
32628 addDirectMem(MIB, X86::RDI);
32629 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
32630 } else if (!isPositionIndependent()) {
32631 MachineInstrBuilder MIB =
32632 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
32633 .addReg(0)
32634 .addImm(0)
32635 .addReg(0)
32636 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
32637 MI.getOperand(3).getTargetFlags())
32638 .addReg(0);
32639 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
32640 addDirectMem(MIB, X86::EAX);
32641 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
32642 } else {
32643 MachineInstrBuilder MIB =
32644 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
32645 .addReg(TII->getGlobalBaseReg(F))
32646 .addImm(0)
32647 .addReg(0)
32648 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
32649 MI.getOperand(3).getTargetFlags())
32650 .addReg(0);
32651 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
32652 addDirectMem(MIB, X86::EAX);
32653 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
32654 }
32655
32656 MI.eraseFromParent(); // The pseudo instruction is gone now.
32657 return BB;
32658}
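
For the 64-bit Darwin case this corresponds roughly to the usual TLV access sequence (illustrative; the symbol and relocation come from operand 3 of the pseudo):

  // movq  _var@TLVP(%rip), %rdi    <- MOV64rm of the TLV descriptor address
  // callq *(%rdi)                  <- CALL64m through the descriptor's getter slot
  // the variable's address then comes back in RAX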
32659
32660static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
32661 switch (RPOpc) {
32662 case X86::INDIRECT_THUNK_CALL32:
32663 return X86::CALLpcrel32;
32664 case X86::INDIRECT_THUNK_CALL64:
32665 return X86::CALL64pcrel32;
32666 case X86::INDIRECT_THUNK_TCRETURN32:
32667 return X86::TCRETURNdi;
32668 case X86::INDIRECT_THUNK_TCRETURN64:
32669 return X86::TCRETURNdi64;
32670 }
32671 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 32671)
;
32672}
32673
32674static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
32675 unsigned Reg) {
32676 if (Subtarget.useRetpolineExternalThunk()) {
32677 // When using an external thunk for retpolines, we pick names that match the
32678 // names GCC happens to use as well. This helps simplify the implementation
32679 // of the thunks for kernels where they have no easy ability to create
32680 // aliases and are doing non-trivial configuration of the thunk's body. For
32681 // example, the Linux kernel will do boot-time hot patching of the thunk
32682 // bodies and cannot easily export aliases of these to loaded modules.
32683 //
32684 // Note that at any point in the future, we may need to change the semantics
32685 // of how we implement retpolines and at that time will likely change the
32686 // name of the called thunk. Essentially, there is no hard guarantee that
32687 // LLVM will generate calls to specific thunks, we merely make a best-effort
32688 // attempt to help out kernels and other systems where duplicating the
32689 // thunks is costly.
32690 switch (Reg) {
32691 case X86::EAX:
32692 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32693 return "__x86_indirect_thunk_eax";
32694 case X86::ECX:
32695 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32696 return "__x86_indirect_thunk_ecx";
32697 case X86::EDX:
32698 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32699 return "__x86_indirect_thunk_edx";
32700 case X86::EDI:
32701 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32702 return "__x86_indirect_thunk_edi";
32703 case X86::R11:
32704 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
32705 return "__x86_indirect_thunk_r11";
32706 }
32707 llvm_unreachable("unexpected reg for external indirect thunk");
32708 }
32709
32710 if (Subtarget.useRetpolineIndirectCalls() ||
32711 Subtarget.useRetpolineIndirectBranches()) {
32712 // When targeting an internal COMDAT thunk use an LLVM-specific name.
32713 switch (Reg) {
32714 case X86::EAX:
32715 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32716 return "__llvm_retpoline_eax";
32717 case X86::ECX:
32718 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32719 return "__llvm_retpoline_ecx";
32720 case X86::EDX:
32721 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32722 return "__llvm_retpoline_edx";
32723 case X86::EDI:
32724 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
32725 return "__llvm_retpoline_edi";
32726 case X86::R11:
32727 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
32728 return "__llvm_retpoline_r11";
32729 }
32730 llvm_unreachable("unexpected reg for retpoline");
32731 }
32732
32733 if (Subtarget.useLVIControlFlowIntegrity()) {
32734 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
32735 return "__llvm_lvi_thunk_r11";
32736 }
32737 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
32738}
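
The names are per-register because a retpoline thunk can only redirect through the one register it was written for. For reference, the body of such a thunk (emitted elsewhere, either by the external thunk provider or by LLVM's thunk inserter, not by this function) is roughly:

  // __llvm_retpoline_r11:
  //   callq .Lcall_target
  // .Lcapture_spec:            <- speculation is trapped in this pause/lfence loop
  //   pause
  //   lfence
  //   jmp .Lcapture_spec
  // .Lcall_target:
  //   movq %r11, (%rsp)        <- overwrite the pushed return address with the callee
  //   retq                     <- "returns" into the real call target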
32739
32740MachineBasicBlock *
32741X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
32742 MachineBasicBlock *BB) const {
32743 // Copy the virtual register into the R11 physical register and
32744 // call the retpoline thunk.
32745 const DebugLoc &DL = MI.getDebugLoc();
32746 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32747 Register CalleeVReg = MI.getOperand(0).getReg();
32748 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
32749
32750 // Find an available scratch register to hold the callee. On 64-bit, we can
32751 // just use R11, but we scan for uses anyway to ensure we don't generate
32752 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
32753 // already a register use operand to the call to hold the callee. If none
32754 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
32755 // register and ESI is the base pointer to realigned stack frames with VLAs.
32756 SmallVector<unsigned, 3> AvailableRegs;
32757 if (Subtarget.is64Bit())
32758 AvailableRegs.push_back(X86::R11);
32759 else
32760 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
32761
32762 // Zero out any registers that are already used.
32763 for (const auto &MO : MI.operands()) {
32764 if (MO.isReg() && MO.isUse())
32765 for (unsigned &Reg : AvailableRegs)
32766 if (Reg == MO.getReg())
32767 Reg = 0;
32768 }
32769
32770 // Choose the first remaining non-zero available register.
32771 unsigned AvailableReg = 0;
32772 for (unsigned MaybeReg : AvailableRegs) {
32773 if (MaybeReg) {
32774 AvailableReg = MaybeReg;
32775 break;
32776 }
32777 }
32778 if (!AvailableReg)
32779 report_fatal_error("calling convention incompatible with retpoline, no "
32780 "available registers");
32781
32782 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
32783
32784 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
32785 .addReg(CalleeVReg);
32786 MI.getOperand(0).ChangeToES(Symbol);
32787 MI.setDesc(TII->get(Opc));
32788 MachineInstrBuilder(*BB->getParent(), &MI)
32789 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
32790 return BB;
32791}
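
A minimal before/after sketch for a 64-bit indirect call through virtual register %callee (illustrative):

  // before: INDIRECT_THUNK_CALL64 %callee, <regmask>, ...
  // after:  $r11 = COPY %callee
  //         CALL64pcrel32 &__llvm_retpoline_r11, <regmask>, ..., implicit killed $r11

The pseudo keeps its remaining operands; only the callee operand is rewritten to the external thunk symbol, and the chosen scratch register is appended as an implicit killed use so the thunk can branch through it.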
32792
32793/// SetJmp implies future control flow change upon calling the corresponding
32794/// LongJmp.
32795/// Instead of using the 'return' instruction, the long jump fixes the stack and
32796/// performs an indirect branch. To do so it uses the registers that were stored
32797/// in the jump buffer (when calling SetJmp).
32798/// In case the shadow stack is enabled we need to fix it as well, because some
32799/// return addresses will be skipped.
32800/// The function will save the SSP for future fixing in the function
32801/// emitLongJmpShadowStackFix.
32802/// \sa emitLongJmpShadowStackFix
32803/// \param [in] MI The temporary Machine Instruction for the builtin.
32804/// \param [in] MBB The Machine Basic Block that will be modified.
32805void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
32806 MachineBasicBlock *MBB) const {
32807 const DebugLoc &DL = MI.getDebugLoc();
32808 MachineFunction *MF = MBB->getParent();
32809 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32810 MachineRegisterInfo &MRI = MF->getRegInfo();
32811 MachineInstrBuilder MIB;
32812
32813 // Memory Reference.
32814 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32815 MI.memoperands_end());
32816
32817 // Initialize a register with zero.
32818 MVT PVT = getPointerTy(MF->getDataLayout());
32819 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
32820 Register ZReg = MRI.createVirtualRegister(PtrRC);
32821 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
32822 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
32823 .addDef(ZReg)
32824 .addReg(ZReg, RegState::Undef)
32825 .addReg(ZReg, RegState::Undef);
32826
32827 // Read the current SSP Register value to the zeroed register.
32828 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
32829 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
32830 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
32831
32832 // Write the SSP register value to slot 3 of the input memory buffer.
32833 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
32834 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
32835 const int64_t SSPOffset = 3 * PVT.getStoreSize();
32836 const unsigned MemOpndSlot = 1;
32837 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32838 if (i == X86::AddrDisp)
32839 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
32840 else
32841 MIB.add(MI.getOperand(MemOpndSlot + i));
32842 }
32843 MIB.addReg(SSPCopyReg);
32844 MIB.setMemRefs(MMOs);
32845}
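
Taken together with emitEHSjLjLongJmp below, the displacements used here imply the following buffer layout (a sketch for a 64-bit target, where PVT.getStoreSize() is 8; the struct and field names are illustrative only):

  #include <cstdint>

  struct SjLjBuf64 {
    uint64_t SavedFP;  // slot 0: frame pointer, reloaded with displacement 0
    uint64_t SavedIP;  // slot 1: LabelOffset = 1 * 8, the restore/label address
    uint64_t SavedSP;  // slot 2: SPOffset    = 2 * 8, the stack pointer
    uint64_t SavedSSP; // slot 3: SSPOffset   = 3 * 8, written by the RDSSP above
  };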
32846
32847MachineBasicBlock *
32848X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
32849 MachineBasicBlock *MBB) const {
32850 const DebugLoc &DL = MI.getDebugLoc();
32851 MachineFunction *MF = MBB->getParent();
32852 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32853 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32854 MachineRegisterInfo &MRI = MF->getRegInfo();
32855
32856 const BasicBlock *BB = MBB->getBasicBlock();
32857 MachineFunction::iterator I = ++MBB->getIterator();
32858
32859 // Memory Reference
32860 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32861 MI.memoperands_end());
32862
32863 unsigned DstReg;
32864 unsigned MemOpndSlot = 0;
32865
32866 unsigned CurOp = 0;
32867
32868 DstReg = MI.getOperand(CurOp++).getReg();
32869 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32870 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
32871 (void)TRI;
32872 Register mainDstReg = MRI.createVirtualRegister(RC);
32873 Register restoreDstReg = MRI.createVirtualRegister(RC);
32874
32875 MemOpndSlot = CurOp;
32876
32877 MVT PVT = getPointerTy(MF->getDataLayout());
32878 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
32879 "Invalid Pointer Size!");
32880
32881 // For v = setjmp(buf), we generate
32882 //
32883 // thisMBB:
32884 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
32885 // SjLjSetup restoreMBB
32886 //
32887 // mainMBB:
32888 // v_main = 0
32889 //
32890 // sinkMBB:
32891 // v = phi(main, restore)
32892 //
32893 // restoreMBB:
32894 // if base pointer being used, load it from frame
32895 // v_restore = 1
32896
32897 MachineBasicBlock *thisMBB = MBB;
32898 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32899 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32900 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
32901 MF->insert(I, mainMBB);
32902 MF->insert(I, sinkMBB);
32903 MF->push_back(restoreMBB);
32904 restoreMBB->setHasAddressTaken();
32905
32906 MachineInstrBuilder MIB;
32907
32908 // Transfer the remainder of BB and its successor edges to sinkMBB.
32909 sinkMBB->splice(sinkMBB->begin(), MBB,
32910 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32911 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32912
32913 // thisMBB:
32914 unsigned PtrStoreOpc = 0;
32915 unsigned LabelReg = 0;
32916 const int64_t LabelOffset = 1 * PVT.getStoreSize();
32917 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
32918 !isPositionIndependent();
32919
32920 // Prepare IP either in reg or imm.
32921 if (!UseImmLabel) {
32922 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
32923 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
32924 LabelReg = MRI.createVirtualRegister(PtrRC);
32925 if (Subtarget.is64Bit()) {
32926 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
32927 .addReg(X86::RIP)
32928 .addImm(0)
32929 .addReg(0)
32930 .addMBB(restoreMBB)
32931 .addReg(0);
32932 } else {
32933 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
32934 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
32935 .addReg(XII->getGlobalBaseReg(MF))
32936 .addImm(0)
32937 .addReg(0)
32938 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
32939 .addReg(0);
32940 }
32941 } else
32942 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
32943 // Store IP
32944 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
32945 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32946 if (i == X86::AddrDisp)
32947 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
32948 else
32949 MIB.add(MI.getOperand(MemOpndSlot + i));
32950 }
32951 if (!UseImmLabel)
32952 MIB.addReg(LabelReg);
32953 else
32954 MIB.addMBB(restoreMBB);
32955 MIB.setMemRefs(MMOs);
32956
32957 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
32958 emitSetJmpShadowStackFix(MI, thisMBB);
32959 }
32960
32961 // Setup
32962 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
32963 .addMBB(restoreMBB);
32964
32965 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
32966 MIB.addRegMask(RegInfo->getNoPreservedMask());
32967 thisMBB->addSuccessor(mainMBB);
32968 thisMBB->addSuccessor(restoreMBB);
32969
32970 // mainMBB:
32971 // EAX = 0
32972 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
32973 mainMBB->addSuccessor(sinkMBB);
32974
32975 // sinkMBB:
32976 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
32977 TII->get(X86::PHI), DstReg)
32978 .addReg(mainDstReg).addMBB(mainMBB)
32979 .addReg(restoreDstReg).addMBB(restoreMBB);
32980
32981 // restoreMBB:
32982 if (RegInfo->hasBasePointer(*MF)) {
32983 const bool Uses64BitFramePtr =
32984 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
32985 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
32986 X86FI->setRestoreBasePointer(MF);
32987 Register FramePtr = RegInfo->getFrameRegister(*MF);
32988 Register BasePtr = RegInfo->getBaseRegister();
32989 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
32990 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
32991 FramePtr, true, X86FI->getRestoreBasePointerOffset())
32992 .setMIFlag(MachineInstr::FrameSetup);
32993 }
32994 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
32995 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32996 restoreMBB->addSuccessor(sinkMBB);
32997
32998 MI.eraseFromParent();
32999 return sinkMBB;
33000}
33001
33002/// Fix the shadow stack using the previously saved SSP pointer.
33003/// \sa emitSetJmpShadowStackFix
33004/// \param [in] MI The temporary Machine Instruction for the builtin.
33005/// \param [in] MBB The Machine Basic Block that will be modified.
33006/// \return The sink MBB that will perform the future indirect branch.
33007MachineBasicBlock *
33008X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33009 MachineBasicBlock *MBB) const {
33010 const DebugLoc &DL = MI.getDebugLoc();
33011 MachineFunction *MF = MBB->getParent();
33012 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33013 MachineRegisterInfo &MRI = MF->getRegInfo();
33014
33015 // Memory Reference
33016 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33017 MI.memoperands_end());
33018
33019 MVT PVT = getPointerTy(MF->getDataLayout());
33020 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33021
33022 // checkSspMBB:
33023 // xor vreg1, vreg1
33024 // rdssp vreg1
33025 // test vreg1, vreg1
33026 // je sinkMBB # Jump if Shadow Stack is not supported
33027 // fallMBB:
33028 // mov buf+24/12(%rip), vreg2
33029 // sub vreg1, vreg2
33030 // jbe sinkMBB # No need to fix the Shadow Stack
33031 // fixShadowMBB:
33032 // shr 3/2, vreg2
33033 // incssp vreg2 # fix the SSP according to the lower 8 bits
33034 // shr 8, vreg2
33035 // je sinkMBB
33036 // fixShadowLoopPrepareMBB:
33037 // shl vreg2
33038 // mov 128, vreg3
33039 // fixShadowLoopMBB:
33040 // incssp vreg3
33041 // dec vreg2
33042 // jne fixShadowLoopMBB # Iterate until you finish fixing
33043 // # the Shadow Stack
33044 // sinkMBB:
33045
33046 MachineFunction::iterator I = ++MBB->getIterator();
33047 const BasicBlock *BB = MBB->getBasicBlock();
33048
33049 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33050 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33051 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33052 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33053 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33054 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33055 MF->insert(I, checkSspMBB);
33056 MF->insert(I, fallMBB);
33057 MF->insert(I, fixShadowMBB);
33058 MF->insert(I, fixShadowLoopPrepareMBB);
33059 MF->insert(I, fixShadowLoopMBB);
33060 MF->insert(I, sinkMBB);
33061
33062 // Transfer the remainder of BB and its successor edges to sinkMBB.
33063 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33064 MBB->end());
33065 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33066
33067 MBB->addSuccessor(checkSspMBB);
33068
33069 // Initialize a register with zero.
33070 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33071 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33072
33073 if (PVT == MVT::i64) {
33074 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33075 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33076 .addImm(0)
33077 .addReg(ZReg)
33078 .addImm(X86::sub_32bit);
33079 ZReg = TmpZReg;
33080 }
33081
33082 // Read the current SSP Register value to the zeroed register.
33083 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33084 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33085 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33086
33087 // Check whether the value read from the SSP register is zero and jump directly
33088 // to the sink.
33089 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33090 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33091 .addReg(SSPCopyReg)
33092 .addReg(SSPCopyReg);
33093 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33094 checkSspMBB->addSuccessor(sinkMBB);
33095 checkSspMBB->addSuccessor(fallMBB);
33096
33097 // Reload the previously saved SSP register value.
33098 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33099 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33100 const int64_t SPPOffset = 3 * PVT.getStoreSize();
33101 MachineInstrBuilder MIB =
33102 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33103 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33104 const MachineOperand &MO = MI.getOperand(i);
33105 if (i == X86::AddrDisp)
33106 MIB.addDisp(MO, SPPOffset);
33107 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33108 // preserve kill flags.
33109 MIB.addReg(MO.getReg());
33110 else
33111 MIB.add(MO);
33112 }
33113 MIB.setMemRefs(MMOs);
33114
33115 // Subtract the current SSP from the previous SSP.
33116 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33117 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33118 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33119 .addReg(PrevSSPReg)
33120 .addReg(SSPCopyReg);
33121
33122 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33123 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33124 fallMBB->addSuccessor(sinkMBB);
33125 fallMBB->addSuccessor(fixShadowMBB);
33126
33127 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33128 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33129 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33130 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33131 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33132 .addReg(SspSubReg)
33133 .addImm(Offset);
33134
33135 // Increase the SSP, looking only at the lower 8 bits of the delta.
33136 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33137 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33138
33139 // Reset the lower 8 bits.
33140 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33141 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33142 .addReg(SspFirstShrReg)
33143 .addImm(8);
33144
33145 // Jump if the result of the shift is zero.
33146 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33147 fixShadowMBB->addSuccessor(sinkMBB);
33148 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33149
33150 // Do a single shift left.
33151 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33152 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33153 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33154 .addReg(SspSecondShrReg);
33155
33156 // Save the value 128 to a register (will be used next with incssp).
33157 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33158 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33159 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33160 .addImm(128);
33161 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33162
33163 // Since incssp only looks at the lower 8 bits, we might need to do several
33164 // iterations of incssp until we finish fixing the shadow stack.
33165 Register DecReg = MRI.createVirtualRegister(PtrRC);
33166 Register CounterReg = MRI.createVirtualRegister(PtrRC);
33167 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33168 .addReg(SspAfterShlReg)
33169 .addMBB(fixShadowLoopPrepareMBB)
33170 .addReg(DecReg)
33171 .addMBB(fixShadowLoopMBB);
33172
33173 // Every iteration we increase the SSP by 128.
33174 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33175
33176 // Every iteration we decrement the counter by 1.
33177 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33178 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33179
33180 // Jump if the counter is not zero yet.
33181 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33182 fixShadowLoopMBB->addSuccessor(sinkMBB);
33183 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33184
33185 return sinkMBB;
33186}
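
The block structure above amounts to the following scalar computation (a model for the 64-bit case; IncSSP stands in for INCSSPQ, which advances the shadow stack pointer by the low 8 bits of its operand, in 8-byte slots):

  #include <cstdint>

  static void FixShadowStackModel(uint64_t CurSSP, uint64_t PrevSSP,
                                  void (*IncSSP)(uint64_t)) {
    if (CurSSP == 0)           // checkSspMBB: RDSSP returned 0, no shadow stack
      return;
    if (PrevSSP <= CurSSP)     // fallMBB: nothing to unwind
      return;
    uint64_t Slots = (PrevSSP - CurSSP) >> 3;  // fixShadowMBB: bytes -> slots
    IncSSP(Slots);                             // consumes only Slots & 0xff
    uint64_t Rounds = Slots >> 8;
    if (Rounds == 0)
      return;
    // fixShadowLoopPrepareMBB doubles the count (the single shift left) because
    // each fixShadowLoopMBB iteration advances by 128 slots, i.e. half of 256.
    for (uint64_t I = Rounds << 1; I != 0; --I)
      IncSSP(128);
  }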
33187
33188MachineBasicBlock *
33189X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33190 MachineBasicBlock *MBB) const {
33191 const DebugLoc &DL = MI.getDebugLoc();
33192 MachineFunction *MF = MBB->getParent();
33193 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33194 MachineRegisterInfo &MRI = MF->getRegInfo();
33195
33196 // Memory Reference
33197 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33198 MI.memoperands_end());
33199
33200 MVT PVT = getPointerTy(MF->getDataLayout());
33201 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33202 "Invalid Pointer Size!");
33203
33204 const TargetRegisterClass *RC =
33205 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33206 Register Tmp = MRI.createVirtualRegister(RC);
33207 // Since FP is only updated here but NOT referenced, it's treated as GPR.
33208 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33209 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33210 Register SP = RegInfo->getStackRegister();
33211
33212 MachineInstrBuilder MIB;
33213
33214 const int64_t LabelOffset = 1 * PVT.getStoreSize();
33215 const int64_t SPOffset = 2 * PVT.getStoreSize();
33216
33217 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33218 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33219
33220 MachineBasicBlock *thisMBB = MBB;
33221
33222 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
33223 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33224 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33225 }
33226
33227 // Reload FP
33228 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33229 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33230 const MachineOperand &MO = MI.getOperand(i);
33231 if (MO.isReg()) // Don't add the whole operand, we don't want to
33232 // preserve kill flags.
33233 MIB.addReg(MO.getReg());
33234 else
33235 MIB.add(MO);
33236 }
33237 MIB.setMemRefs(MMOs);
33238
33239 // Reload IP
33240 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33241 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33242 const MachineOperand &MO = MI.getOperand(i);
33243 if (i == X86::AddrDisp)
33244 MIB.addDisp(MO, LabelOffset);
33245 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33246 // preserve kill flags.
33247 MIB.addReg(MO.getReg());
33248 else
33249 MIB.add(MO);
33250 }
33251 MIB.setMemRefs(MMOs);
33252
33253 // Reload SP
33254 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33255 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33256 if (i == X86::AddrDisp)
33257 MIB.addDisp(MI.getOperand(i), SPOffset);
33258 else
33259 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33260 // the last instruction of the expansion.
33261 }
33262 MIB.setMemRefs(MMOs);
33263
33264 // Jump
33265 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33266
33267 MI.eraseFromParent();
33268 return thisMBB;
33269}
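
Combined with the buffer layout sketched after emitSetJmpShadowStackFix, the expansion is roughly (64-bit case, illustrative; %tmp is the scratch virtual register created above):

  // movq  0(buf), %rbp     <- reload FP
  // movq  8(buf), %tmp     <- reload IP (LabelOffset)
  // movq 16(buf), %rsp     <- reload SP (SPOffset)
  // jmpq *%tmp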
33270
33271void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33272 MachineBasicBlock *MBB,
33273 MachineBasicBlock *DispatchBB,
33274 int FI) const {
33275 const DebugLoc &DL = MI.getDebugLoc();
33276 MachineFunction *MF = MBB->getParent();
33277 MachineRegisterInfo *MRI = &MF->getRegInfo();
33278 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33279
33280 MVT PVT = getPointerTy(MF->getDataLayout());
33281 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33282
33283 unsigned Op = 0;
33284 unsigned VR = 0;
33285
33286 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33287 !isPositionIndependent();
33288
33289 if (UseImmLabel) {
33290 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33291 } else {
33292 const TargetRegisterClass *TRC =
33293 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33294 VR = MRI->createVirtualRegister(TRC);
33295 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33296
33297 if (Subtarget.is64Bit())
33298 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33299 .addReg(X86::RIP)
33300 .addImm(1)
33301 .addReg(0)
33302 .addMBB(DispatchBB)
33303 .addReg(0);
33304 else
33305 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33306 .addReg(0) /* TII->getGlobalBaseReg(MF) */
33307 .addImm(1)
33308 .addReg(0)
33309 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33310 .addReg(0);
33311 }
33312
33313 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33314 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33315 if (UseImmLabel)
33316 MIB.addMBB(DispatchBB);
33317 else
33318 MIB.addReg(VR);
33319}
33320
33321MachineBasicBlock *
33322X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33323 MachineBasicBlock *BB) const {
33324 const DebugLoc &DL = MI.getDebugLoc();
33325 MachineFunction *MF = BB->getParent();
33326 MachineRegisterInfo *MRI = &MF->getRegInfo();
33327 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33328 int FI = MF->getFrameInfo().getFunctionContextIndex();
33329
33330 // Get a mapping of the call site numbers to all of the landing pads they're
33331 // associated with.
33332 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
33333 unsigned MaxCSNum = 0;
33334 for (auto &MBB : *MF) {
33335 if (!MBB.isEHPad())
33336 continue;
33337
33338 MCSymbol *Sym = nullptr;
33339 for (const auto &MI : MBB) {
33340 if (MI.isDebugInstr())
33341 continue;
33342
33343 assert(MI.isEHLabel() && "expected EH_LABEL");
33344 Sym = MI.getOperand(0).getMCSymbol();
33345 break;
33346 }
33347
33348 if (!MF->hasCallSiteLandingPad(Sym))
33349 continue;
33350
33351 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
33352 CallSiteNumToLPad[CSI].push_back(&MBB);
33353 MaxCSNum = std::max(MaxCSNum, CSI);
33354 }
33355 }
33356
33357 // Get an ordered list of the machine basic blocks for the jump table.
33358 std::vector<MachineBasicBlock *> LPadList;
33359 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
33360 LPadList.reserve(CallSiteNumToLPad.size());
33361
33362 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
33363 for (auto &LP : CallSiteNumToLPad[CSI]) {
33364 LPadList.push_back(LP);
33365 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
33366 }
33367 }
33368
33369 assert(!LPadList.empty() &&
33370 "No landing pad destinations for the dispatch jump table!");
33371
33372 // Create the MBBs for the dispatch code.
33373
33374 // Shove the dispatch's address into the return slot in the function context.
33375 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
33376 DispatchBB->setIsEHPad(true);
33377
33378 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
33379 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
33380 DispatchBB->addSuccessor(TrapBB);
33381
33382 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
33383 DispatchBB->addSuccessor(DispContBB);
33384
33385 // Insert MBBs.
33386 MF->push_back(DispatchBB);
33387 MF->push_back(DispContBB);
33388 MF->push_back(TrapBB);
33389
33390 // Insert code into the entry block that creates and registers the function
33391 // context.
33392 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
33393
33394 // Create the jump table and associated information
33395 unsigned JTE = getJumpTableEncoding();
33396 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
33397 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
33398
33399 const X86RegisterInfo &RI = TII->getRegisterInfo();
33400 // Add a register mask with no preserved registers. This results in all
33401 // registers being marked as clobbered.
33402 if (RI.hasBasePointer(*MF)) {
33403 const bool FPIs64Bit =
33404 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33405 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
33406 MFI->setRestoreBasePointer(MF);
33407
33408 Register FP = RI.getFrameRegister(*MF);
33409 Register BP = RI.getBaseRegister();
33410 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
33411 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
33412 MFI->getRestoreBasePointerOffset())
33413 .addRegMask(RI.getNoPreservedMask());
33414 } else {
33415 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
33416 .addRegMask(RI.getNoPreservedMask());
33417 }
33418
33419 // IReg is used as an index in a memory operand and therefore can't be SP
33420 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
33421 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
33422 Subtarget.is64Bit() ? 8 : 4);
33423 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
33424 .addReg(IReg)
33425 .addImm(LPadList.size());
33426 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
33427
33428 if (Subtarget.is64Bit()) {
33429 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33430 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
33431
33432 // leaq .LJTI0_0(%rip), BReg
33433 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
33434 .addReg(X86::RIP)
33435 .addImm(1)
33436 .addReg(0)
33437 .addJumpTableIndex(MJTI)
33438 .addReg(0);
33439 // movzx IReg64, IReg
33440 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
33441 .addImm(0)
33442 .addReg(IReg)
33443 .addImm(X86::sub_32bit);
33444
33445 switch (JTE) {
33446 case MachineJumpTableInfo::EK_BlockAddress:
33447 // jmpq *(BReg,IReg64,8)
33448 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
33449 .addReg(BReg)
33450 .addImm(8)
33451 .addReg(IReg64)
33452 .addImm(0)
33453 .addReg(0);
33454 break;
33455 case MachineJumpTableInfo::EK_LabelDifference32: {
33456 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
33457 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
33458 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
33459
33460 // movl (BReg,IReg64,4), OReg
33461 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
33462 .addReg(BReg)
33463 .addImm(4)
33464 .addReg(IReg64)
33465 .addImm(0)
33466 .addReg(0);
33467 // movsx OReg64, OReg
33468 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
33469 // addq BReg, OReg64, TReg
33470 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
33471 .addReg(OReg64)
33472 .addReg(BReg);
33473 // jmpq *TReg
33474 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
33475 break;
33476 }
33477 default:
33478 llvm_unreachable("Unexpected jump table encoding");
33479 }
33480 } else {
33481 // jmpl *.LJTI0_0(,IReg,4)
33482 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
33483 .addReg(0)
33484 .addImm(4)
33485 .addReg(IReg)
33486 .addJumpTableIndex(MJTI)
33487 .addReg(0);
33488 }
33489
33490 // Add the jump table entries as successors to the MBB.
33491 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
33492 for (auto &LP : LPadList)
33493 if (SeenMBBs.insert(LP).second)
33494 DispContBB->addSuccessor(LP);
33495
33496 // N.B. the order the invoke BBs are processed in doesn't matter here.
33497 SmallVector<MachineBasicBlock *, 64> MBBLPads;
33498 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
33499 for (MachineBasicBlock *MBB : InvokeBBs) {
33500 // Remove the landing pad successor from the invoke block and replace it
33501 // with the new dispatch block.
33502 // Keep a copy of Successors since it's modified inside the loop.
33503 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
33504 MBB->succ_rend());
33505 // FIXME: Avoid quadratic complexity.
33506 for (auto MBBS : Successors) {
33507 if (MBBS->isEHPad()) {
33508 MBB->removeSuccessor(MBBS);
33509 MBBLPads.push_back(MBBS);
33510 }
33511 }
33512
33513 MBB->addSuccessor(DispatchBB);
33514
33515 // Find the invoke call and mark all of the callee-saved registers as
33516 // 'implicit defined' so that they're spilled. This prevents code from
33517 // moving instructions to before the EH block, where they will never be
33518 // executed.
33519 for (auto &II : reverse(*MBB)) {
33520 if (!II.isCall())
33521 continue;
33522
33523 DenseMap<unsigned, bool> DefRegs;
33524 for (auto &MOp : II.operands())
33525 if (MOp.isReg())
33526 DefRegs[MOp.getReg()] = true;
33527
33528 MachineInstrBuilder MIB(*MF, &II);
33529 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
33530 unsigned Reg = SavedRegs[RegIdx];
33531 if (!DefRegs[Reg])
33532 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
33533 }
33534
33535 break;
33536 }
33537 }
33538
33539 // Mark all former landing pads as non-landing pads. The dispatch is the only
33540 // landing pad now.
33541 for (auto &LP : MBBLPads)
33542 LP->setIsEHPad(false);
33543
33544 // The instruction is gone now.
33545 MI.eraseFromParent();
33546 return BB;
33547}
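
At run time the dispatch code behaves roughly like the sketch below (illustrative only: the index is whatever the SjLj runtime stored in the call-site slot of the function context, and the emitted code jumps through the jump table rather than calling through it):

  #include <cstdint>

  using LandingPad = void (*)();

  void DispatchModel(uint32_t CallSiteIdx, LandingPad JumpTable[],
                     uint32_t NumLandingPads) {
    if (CallSiteIdx >= NumLandingPads)  // CMP32ri + JCC_1 (COND_AE) -> TrapBB
      __builtin_trap();                 // X86::TRAP
    JumpTable[CallSiteIdx]();           // JMP64m / JMP32m through the table
  }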
33548
33549MachineBasicBlock *
33550X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
33551 MachineBasicBlock *BB) const {
33552 MachineFunction *MF = BB->getParent();
33553 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33554 const DebugLoc &DL = MI.getDebugLoc();
33555
33556 auto TMMImmToTMMReg = [](unsigned Imm) {
33557 assert (Imm < 8 && "Illegal tmm index");
33558 return X86::TMM0 + Imm;
33559 };
33560 switch (MI.getOpcode()) {
33561 default: llvm_unreachable("Unexpected instr type to insert");
33562 case X86::TLS_addr32:
33563 case X86::TLS_addr64:
33564 case X86::TLS_addrX32:
33565 case X86::TLS_base_addr32:
33566 case X86::TLS_base_addr64:
33567 case X86::TLS_base_addrX32:
33568 return EmitLoweredTLSAddr(MI, BB);
33569 case X86::INDIRECT_THUNK_CALL32:
33570 case X86::INDIRECT_THUNK_CALL64:
33571 case X86::INDIRECT_THUNK_TCRETURN32:
33572 case X86::INDIRECT_THUNK_TCRETURN64:
33573 return EmitLoweredIndirectThunk(MI, BB);
33574 case X86::CATCHRET:
33575 return EmitLoweredCatchRet(MI, BB);
33576 case X86::SEG_ALLOCA_32:
33577 case X86::SEG_ALLOCA_64:
33578 return EmitLoweredSegAlloca(MI, BB);
33579 case X86::PROBED_ALLOCA_32:
33580 case X86::PROBED_ALLOCA_64:
33581 return EmitLoweredProbedAlloca(MI, BB);
33582 case X86::TLSCall_32:
33583 case X86::TLSCall_64:
33584 return EmitLoweredTLSCall(MI, BB);
33585 case X86::CMOV_FR32:
33586 case X86::CMOV_FR32X:
33587 case X86::CMOV_FR64:
33588 case X86::CMOV_FR64X:
33589 case X86::CMOV_GR8:
33590 case X86::CMOV_GR16:
33591 case X86::CMOV_GR32:
33592 case X86::CMOV_RFP32:
33593 case X86::CMOV_RFP64:
33594 case X86::CMOV_RFP80:
33595 case X86::CMOV_VR64:
33596 case X86::CMOV_VR128:
33597 case X86::CMOV_VR128X:
33598 case X86::CMOV_VR256:
33599 case X86::CMOV_VR256X:
33600 case X86::CMOV_VR512:
33601 case X86::CMOV_VK1:
33602 case X86::CMOV_VK2:
33603 case X86::CMOV_VK4:
33604 case X86::CMOV_VK8:
33605 case X86::CMOV_VK16:
33606 case X86::CMOV_VK32:
33607 case X86::CMOV_VK64:
33608 return EmitLoweredSelect(MI, BB);
33609
33610 case X86::RDFLAGS32:
33611 case X86::RDFLAGS64: {
33612 unsigned PushF =
33613 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
33614 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
33615 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
33616 // Permit reads of the EFLAGS and DF registers without them being defined.
33617 // This intrinsic exists to read external processor state in flags, such as
33618 // the trap flag, interrupt flag, and direction flag, none of which are
33619 // modeled by the backend.
33620 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
33621 "Unexpected register in operand!");
33622 Push->getOperand(2).setIsUndef();
33623 assert(Push->getOperand(3).getReg() == X86::DF &&
33624 "Unexpected register in operand!");
33625 Push->getOperand(3).setIsUndef();
33626 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
33627
33628 MI.eraseFromParent(); // The pseudo is gone now.
33629 return BB;
33630 }
33631
33632 case X86::WRFLAGS32:
33633 case X86::WRFLAGS64: {
33634 unsigned Push =
33635 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
33636 unsigned PopF =
33637 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
33638 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
33639 BuildMI(*BB, MI, DL, TII->get(PopF));
33640
33641 MI.eraseFromParent(); // The pseudo is gone now.
33642 return BB;
33643 }
33644
33645 case X86::FP32_TO_INT16_IN_MEM:
33646 case X86::FP32_TO_INT32_IN_MEM:
33647 case X86::FP32_TO_INT64_IN_MEM:
33648 case X86::FP64_TO_INT16_IN_MEM:
33649 case X86::FP64_TO_INT32_IN_MEM:
33650 case X86::FP64_TO_INT64_IN_MEM:
33651 case X86::FP80_TO_INT16_IN_MEM:
33652 case X86::FP80_TO_INT32_IN_MEM:
33653 case X86::FP80_TO_INT64_IN_MEM: {
33654 // Change the floating point control register to use "round towards zero"
33655 // mode when truncating to an integer value.
33656 int OrigCWFrameIdx =
33657 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
33658 addFrameReference(BuildMI(*BB, MI, DL,
33659 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
33660
33661 // Load the old value of the control word...
33662 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
33663 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
33664 OrigCWFrameIdx);
33665
33666 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
33667 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
33668 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
33669 .addReg(OldCW, RegState::Kill).addImm(0xC00);
33670
33671 // Extract to 16 bits.
33672 Register NewCW16 =
33673 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
33674 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
33675 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
33676
33677 // Prepare memory for FLDCW.
33678 int NewCWFrameIdx =
33679 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
33680 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
33681 NewCWFrameIdx)
33682 .addReg(NewCW16, RegState::Kill);
33683
33684 // Reload the modified control word now...
33685 addFrameReference(BuildMI(*BB, MI, DL,
33686 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
33687
33688 // Get the X86 opcode to use.
33689 unsigned Opc;
33690 switch (MI.getOpcode()) {
33691 default: llvm_unreachable("illegal opcode!");
33692 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
33693 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
33694 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
33695 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
33696 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
33697 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
33698 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
33699 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
33700 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
33701 }
33702
33703 X86AddressMode AM = getAddressFromInstr(&MI, 0);
33704 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
33705 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
33706
33707 // Reload the original control word now.
33708 addFrameReference(BuildMI(*BB, MI, DL,
33709 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
33710
33711 MI.eraseFromParent(); // The pseudo instruction is gone now.
33712 return BB;
33713 }
33714
33715 // xbegin
33716 case X86::XBEGIN:
33717 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
33718
33719 case X86::VASTART_SAVE_XMM_REGS:
33720 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
33721
33722 case X86::VAARG_64:
33723 return EmitVAARG64WithCustomInserter(MI, BB);
33724
33725 case X86::EH_SjLj_SetJmp32:
33726 case X86::EH_SjLj_SetJmp64:
33727 return emitEHSjLjSetJmp(MI, BB);
33728
33729 case X86::EH_SjLj_LongJmp32:
33730 case X86::EH_SjLj_LongJmp64:
33731 return emitEHSjLjLongJmp(MI, BB);
33732
33733 case X86::Int_eh_sjlj_setup_dispatch:
33734 return EmitSjLjDispatchBlock(MI, BB);
33735
33736 case TargetOpcode::STATEPOINT:
33737 // As an implementation detail, STATEPOINT shares the STACKMAP format at
33738 // this point in the process. We diverge later.
33739 return emitPatchPoint(MI, BB);
33740
33741 case TargetOpcode::STACKMAP:
33742 case TargetOpcode::PATCHPOINT:
33743 return emitPatchPoint(MI, BB);
33744
33745 case TargetOpcode::PATCHABLE_EVENT_CALL:
33746 return emitXRayCustomEvent(MI, BB);
33747
33748 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
33749 return emitXRayTypedEvent(MI, BB);
33750
33751 case X86::LCMPXCHG8B: {
33752 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
33753 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
33754 // requires a memory operand. If the current target is i686 and the current
33755 // function needs a base pointer
33756 // - which is ESI on i686 - the register allocator would not be able to
33757 // allocate registers for an address of the form X(%reg, %reg, Y):
33758 // there would never be enough unreserved registers during regalloc
33759 // (without the base pointer the only option would be X(%edi, %esi, Y)).
33760 // We give the register allocator a hand by precomputing the address in
33761 // a new vreg using LEA.
33762
33763 // If it is not i686 or there is no base pointer - nothing to do here.
33764 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
33765 return BB;
33766
33767 // Even though this code does not necessarily need the base pointer to
33768 // be ESI, we check for that. The reason: if this assert fails, something
33769 // has changed in the compiler's base pointer handling, which most
33770 // probably has to be addressed somehow here.
33771 assert(TRI->getBaseRegister() == X86::ESI &&
33772 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
33773 "base pointer in mind");
33774
33775 MachineRegisterInfo &MRI = MF->getRegInfo();
33776 MVT SPTy = getPointerTy(MF->getDataLayout());
33777 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
33778 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
33779
33780 X86AddressMode AM = getAddressFromInstr(&MI, 0);
33781 // Regalloc does not need any help when the memory operand of CMPXCHG8B
33782 // does not use index register.
33783 if (AM.IndexReg == X86::NoRegister)
33784 return BB;
33785
33786 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
33787 // four operand definitions that are E[ABCD] registers. We skip them and
33788 // then insert the LEA.
33789 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
33790 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
33791 RMBBI->definesRegister(X86::EBX) ||
33792 RMBBI->definesRegister(X86::ECX) ||
33793 RMBBI->definesRegister(X86::EDX))) {
33794 ++RMBBI;
33795 }
33796 MachineBasicBlock::iterator MBBI(RMBBI);
33797 addFullAddress(
33798 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
33799
33800 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
33801
33802 return BB;
33803 }
33804 case X86::LCMPXCHG16B_NO_RBX: {
33805 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
33806 Register BasePtr = TRI->getBaseRegister();
33807 if (TRI->hasBasePointer(*MF) &&
33808 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
33809 if (!BB->isLiveIn(BasePtr))
33810 BB->addLiveIn(BasePtr);
33811 // Save RBX into a virtual register.
33812 Register SaveRBX =
33813 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33814 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
33815 .addReg(X86::RBX);
33816 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33817 MachineInstrBuilder MIB =
33818 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
33819 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
33820 MIB.add(MI.getOperand(Idx));
33821 MIB.add(MI.getOperand(X86::AddrNumOperands));
33822 MIB.addReg(SaveRBX);
33823 } else {
33824 // Simple case, just copy the virtual register to RBX.
33825 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
33826 .add(MI.getOperand(X86::AddrNumOperands));
33827 MachineInstrBuilder MIB =
33828 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
33829 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
33830 MIB.add(MI.getOperand(Idx));
33831 }
33832 MI.eraseFromParent();
33833 return BB;
33834 }
33835 case X86::MWAITX: {
33836 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
33837 Register BasePtr = TRI->getBaseRegister();
33838 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
33839 // If there is no need to save the base pointer, we generate MWAITXrrr;
33840 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
33841 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
33842 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
33843 .addReg(MI.getOperand(0).getReg());
33844 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
33845 .addReg(MI.getOperand(1).getReg());
33846 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
33847 .addReg(MI.getOperand(2).getReg());
33848 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
33849 MI.eraseFromParent();
33850 } else {
33851 if (!BB->isLiveIn(BasePtr)) {
33852 BB->addLiveIn(BasePtr);
33853 }
33854 // Parameters can be copied into ECX and EAX but not EBX yet.
33855 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
33856 .addReg(MI.getOperand(0).getReg());
33857 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
33858 .addReg(MI.getOperand(1).getReg());
33859 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
33860 // Save RBX into a virtual register.
33861 Register SaveRBX =
33862 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33863 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
33864 .addReg(X86::RBX);
33865 // Generate mwaitx pseudo.
33866 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33867 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
33868 .addDef(Dst) // Destination tied in with SaveRBX.
33869 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
33870 .addUse(SaveRBX); // Save of base pointer.
33871 MI.eraseFromParent();
33872 }
33873 return BB;
33874 }
33875 case TargetOpcode::PREALLOCATED_SETUP: {
33876 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
33877 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
33878 MFI->setHasPreallocatedCall(true);
33879 int64_t PreallocatedId = MI.getOperand(0).getImm();
33880 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
33881 assert(StackAdjustment != 0 && "0 stack adjustment");
33882 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
33883 << StackAdjustment << "\n");
33884 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
33885 .addReg(X86::ESP)
33886 .addImm(StackAdjustment);
33887 MI.eraseFromParent();
33888 return BB;
33889 }
33890 case TargetOpcode::PREALLOCATED_ARG: {
33891 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
33892 int64_t PreallocatedId = MI.getOperand(1).getImm();
33893 int64_t ArgIdx = MI.getOperand(2).getImm();
33894 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
33895 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
33896 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
33897 << ", arg offset " << ArgOffset << "\n");
33898 // stack pointer + offset
33899 addRegOffset(
33900 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
33901 X86::ESP, false, ArgOffset);
33902 MI.eraseFromParent();
33903 return BB;
33904 }
33905 case X86::PTDPBSSD:
33906 case X86::PTDPBSUD:
33907 case X86::PTDPBUSD:
33908 case X86::PTDPBUUD:
33909 case X86::PTDPBF16PS: {
33910 unsigned Opc;
33911 switch (MI.getOpcode()) {
33912 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
33913 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
33914 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
33915 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
33916 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
33917 }
33918
33919 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
33920 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
33921 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
33922 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
33923 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
33924
33925 MI.eraseFromParent(); // The pseudo is gone now.
33926 return BB;
33927 }
33928 case X86::PTILEZERO: {
33929 unsigned Imm = MI.getOperand(0).getImm();
33930 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
33931 MI.eraseFromParent(); // The pseudo is gone now.
33932 return BB;
33933 }
33934 case X86::PTILELOADD:
33935 case X86::PTILELOADDT1:
33936 case X86::PTILESTORED: {
33937 unsigned Opc;
33938 switch (MI.getOpcode()) {
33939 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
33940 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
33941 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
33942 }
33943
33944 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
33945 unsigned CurOp = 0;
33946 if (Opc != X86::TILESTORED)
33947 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
33948 RegState::Define);
33949
33950 MIB.add(MI.getOperand(CurOp++)); // base
33951 MIB.add(MI.getOperand(CurOp++)); // scale
33952 MIB.add(MI.getOperand(CurOp++)); // index -- stride
33953 MIB.add(MI.getOperand(CurOp++)); // displacement
33954 MIB.add(MI.getOperand(CurOp++)); // segment
33955
33956 if (Opc == X86::TILESTORED)
33957 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
33958 RegState::Undef);
33959
33960 MI.eraseFromParent(); // The pseudo is gone now.
33961 return BB;
33962 }
33963 }
33964}
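
The PTILELOADD/PTILESTORED expansions above forward the five sub-operands of an X86 memory reference in a fixed order. A standalone sketch (not LLVM code, illustrative field values only) of that layout:

#include <cstdio>

// Standalone sketch (not LLVM code) of the five sub-operands that make up an
// X86 memory reference, in the order the tile load/store expansions above
// forward them: base, scale, index, displacement, segment.
struct X86MemOperand {
  int Base;    // base register (illustrative encoding, 0 == none)
  int Scale;   // 1, 2, 4 or 8
  int Index;   // index register (0 == none); carries the stride for AMX tiles
  int Disp;    // signed displacement
  int Segment; // segment register (0 == default)
};

int main() {
  X86MemOperand M{/*Base=*/1, /*Scale=*/1, /*Index=*/2, /*Disp=*/0,
                  /*Segment=*/0};
  std::printf("base=%d scale=%d index(stride)=%d disp=%d seg=%d\n", M.Base,
              M.Scale, M.Index, M.Disp, M.Segment);
  return 0;
}
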
33965
33966//===----------------------------------------------------------------------===//
33967// X86 Optimization Hooks
33968//===----------------------------------------------------------------------===//
33969
33970bool
33971X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
33972 const APInt &DemandedBits,
33973 const APInt &DemandedElts,
33974 TargetLoweringOpt &TLO) const {
33975 EVT VT = Op.getValueType();
33976 unsigned Opcode = Op.getOpcode();
33977 unsigned EltSize = VT.getScalarSizeInBits();
33978
33979 if (VT.isVector()) {
33980 // If the constant is all sign bits within the active bits, then we should
33981 // sign extend it across the entire constant so that it can act as a boolean
33982 // constant vector.
33983 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
33984 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
33985 return false;
33986 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
33987 if (!DemandedElts[i] || V.getOperand(i).isUndef())
33988 continue;
33989 const APInt &Val = V.getConstantOperandAPInt(i);
33990 if (Val.getBitWidth() > Val.getNumSignBits() &&
33991 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
33992 return true;
33993 }
33994 return false;
33995 };
33996 // For vectors - if we have a constant, then try to sign extend.
33997 // TODO: Handle AND/ANDN cases.
33998 unsigned ActiveBits = DemandedBits.getActiveBits();
33999 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34000 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34001 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34002 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34003 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34004 VT.getVectorNumElements());
34005 SDValue NewC =
34006 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34007 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34008 SDValue NewOp =
34009 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34010 return TLO.CombineTo(Op, NewOp);
34011 }
34012 return false;
34013 }
34014
34015 // Only optimize Ands to prevent shrinking a constant that could be
34016 // matched by movzx.
34017 if (Opcode != ISD::AND)
34018 return false;
34019
34020 // Make sure the RHS really is a constant.
34021 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34022 if (!C)
34023 return false;
34024
34025 const APInt &Mask = C->getAPIntValue();
34026
34027 // Clear all non-demanded bits initially.
34028 APInt ShrunkMask = Mask & DemandedBits;
34029
34030 // Find the width of the shrunk mask.
34031 unsigned Width = ShrunkMask.getActiveBits();
34032
34033 // If the mask is all 0s there's nothing to do here.
34034 if (Width == 0)
34035 return false;
34036
34037 // Find the next power of 2 width, rounding up to a byte.
34038 Width = PowerOf2Ceil(std::max(Width, 8U));
34040 // Truncate the width to the element size to handle illegal types.
34040 Width = std::min(Width, EltSize);
34041
34042 // Calculate a possible zero extend mask for this constant.
34043 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
34044
34045 // If we aren't changing the mask, just return true to keep it and prevent
34046 // the caller from optimizing.
34047 if (ZeroExtendMask == Mask)
34048 return true;
34049
34050 // Make sure the new mask can be represented by a combination of mask bits
34051 // and non-demanded bits.
34052 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34053 return false;
34054
34055 // Replace the constant with the zero extend mask.
34056 SDLoc DL(Op);
34057 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34058 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34059 return TLO.CombineTo(Op, NewOp);
34060}
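
A standalone sketch of the mask-widening arithmetic in targetShrinkDemandedConstant, using plain integers instead of APInt and hypothetical values; the final subset check against the non-demanded bits is omitted for brevity:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Round a positive 32-bit value up to the next power of two.
static uint32_t powerOf2Ceil(uint32_t X) {
  uint32_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

int main() {
  const unsigned EltSize = 32;                  // hypothetical scalar width
  uint32_t Mask = 0x3FF;                        // original AND constant
  uint32_t DemandedBits = 0xFF;                 // bits actually used downstream
  uint32_t Shrunk = Mask & DemandedBits;        // 0xFF
  unsigned Width = 32 - __builtin_clz(Shrunk);  // active bits: 8
  Width = powerOf2Ceil(std::max(Width, 8u));    // round up to a byte
  Width = std::min(Width, EltSize);             // clamp to the element size
  uint32_t ZeroExtendMask =
      Width == 32 ? 0xFFFFFFFFu : ((1u << Width) - 1);
  std::printf("0x%X -> 0x%X\n", Mask, ZeroExtendMask); // 0x3FF -> 0xFF (movzx-able)
  return 0;
}
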
34061
34062void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34063 KnownBits &Known,
34064 const APInt &DemandedElts,
34065 const SelectionDAG &DAG,
34066 unsigned Depth) const {
34067 unsigned BitWidth = Known.getBitWidth();
34068 unsigned NumElts = DemandedElts.getBitWidth();
34069 unsigned Opc = Op.getOpcode();
34070 EVT VT = Op.getValueType();
34071 assert((Opc >= ISD::BUILTIN_OP_END ||
34072 Opc == ISD::INTRINSIC_WO_CHAIN ||
34073 Opc == ISD::INTRINSIC_W_CHAIN ||
34074 Opc == ISD::INTRINSIC_VOID) &&
34075 "Should use MaskedValueIsZero if you don't know whether Op"
34076 " is a target node!");
34077
34078 Known.resetAll();
34079 switch (Opc) {
34080 default: break;
34081 case X86ISD::SETCC:
34082 Known.Zero.setBitsFrom(1);
34083 break;
34084 case X86ISD::MOVMSK: {
34085 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34086 Known.Zero.setBitsFrom(NumLoBits);
34087 break;
34088 }
34089 case X86ISD::PEXTRB:
34090 case X86ISD::PEXTRW: {
34091 SDValue Src = Op.getOperand(0);
34092 EVT SrcVT = Src.getValueType();
34093 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34094 Op.getConstantOperandVal(1));
34095 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34096 Known = Known.anyextOrTrunc(BitWidth);
34097 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34098 break;
34099 }
34100 case X86ISD::VSRAI:
34101 case X86ISD::VSHLI:
34102 case X86ISD::VSRLI: {
34103 unsigned ShAmt = Op.getConstantOperandVal(1);
34104 if (ShAmt >= VT.getScalarSizeInBits()) {
34105 Known.setAllZero();
34106 break;
34107 }
34108
34109 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34110 if (Opc == X86ISD::VSHLI) {
34111 Known.Zero <<= ShAmt;
34112 Known.One <<= ShAmt;
34113 // Low bits are known zero.
34114 Known.Zero.setLowBits(ShAmt);
34115 } else if (Opc == X86ISD::VSRLI) {
34116 Known.Zero.lshrInPlace(ShAmt);
34117 Known.One.lshrInPlace(ShAmt);
34118 // High bits are known zero.
34119 Known.Zero.setHighBits(ShAmt);
34120 } else {
34121 Known.Zero.ashrInPlace(ShAmt);
34122 Known.One.ashrInPlace(ShAmt);
34123 }
34124 break;
34125 }
34126 case X86ISD::PACKUS: {
34127 // PACKUS is just a truncation if the upper half is zero.
34128 APInt DemandedLHS, DemandedRHS;
34129 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34130
34131 Known.One = APInt::getAllOnesValue(BitWidth * 2);
34132 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34133
34134 KnownBits Known2;
34135 if (!!DemandedLHS) {
34136 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34137 Known = KnownBits::commonBits(Known, Known2);
34138 }
34139 if (!!DemandedRHS) {
34140 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34141 Known = KnownBits::commonBits(Known, Known2);
34142 }
34143
34144 if (Known.countMinLeadingZeros() < BitWidth)
34145 Known.resetAll();
34146 Known = Known.trunc(BitWidth);
34147 break;
34148 }
34149 case X86ISD::ANDNP: {
34150 KnownBits Known2;
34151 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34152 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34153
34154 // ANDNP = (~X & Y);
34155 Known.One &= Known2.Zero;
34156 Known.Zero |= Known2.One;
34157 break;
34158 }
34159 case X86ISD::FOR: {
34160 KnownBits Known2;
34161 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34162 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34163
34164 Known |= Known2;
34165 break;
34166 }
34167 case X86ISD::PSADBW: {
34168 assert(VT.getScalarType() == MVT::i64 &&
34169 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34170 "Unexpected PSADBW types");
34171
34172 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34173 Known.Zero.setBitsFrom(16);
34174 break;
34175 }
34176 case X86ISD::CMOV: {
34177 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34178 // If we don't know any bits, early out.
34179 if (Known.isUnknown())
34180 break;
34181 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34182
34183 // Only known if known in both the LHS and RHS.
34184 Known = KnownBits::commonBits(Known, Known2);
34185 break;
34186 }
34187 case X86ISD::BEXTR:
34188 case X86ISD::BEXTRI: {
34189 SDValue Op0 = Op.getOperand(0);
34190 SDValue Op1 = Op.getOperand(1);
34191
34192 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34193 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34194 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
34195
34196 // If the length is 0, the result is 0.
34197 if (Length == 0) {
34198 Known.setAllZero();
34199 break;
34200 }
34201
34202 if ((Shift + Length) <= BitWidth) {
34203 Known = DAG.computeKnownBits(Op0, Depth + 1);
34204 Known = Known.extractBits(Length, Shift);
34205 Known = Known.zextOrTrunc(BitWidth);
34206 }
34207 }
34208 break;
34209 }
34210 case X86ISD::PDEP: {
34211 KnownBits Known2;
34212 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34213 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34214 // Zeros are retained from the mask operand, but ones are not.
34215 Known.One.clearAllBits();
34216 // The result will have at least as many trailing zeros as the non-mask
34217 // operand since bits can only map to the same or higher bit position.
34218 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34219 break;
34220 }
34221 case X86ISD::PEXT: {
34222 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34223 // The result has at least as many leading zeros as there are zero bits in the mask.
34224 unsigned Count = Known.Zero.countPopulation();
34225 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34226 Known.One.clearAllBits();
34227 break;
34228 }
34229 case X86ISD::VTRUNC:
34230 case X86ISD::VTRUNCS:
34231 case X86ISD::VTRUNCUS:
34232 case X86ISD::CVTSI2P:
34233 case X86ISD::CVTUI2P:
34234 case X86ISD::CVTP2SI:
34235 case X86ISD::CVTP2UI:
34236 case X86ISD::MCVTP2SI:
34237 case X86ISD::MCVTP2UI:
34238 case X86ISD::CVTTP2SI:
34239 case X86ISD::CVTTP2UI:
34240 case X86ISD::MCVTTP2SI:
34241 case X86ISD::MCVTTP2UI:
34242 case X86ISD::MCVTSI2P:
34243 case X86ISD::MCVTUI2P:
34244 case X86ISD::VFPROUND:
34245 case X86ISD::VMFPROUND:
34246 case X86ISD::CVTPS2PH:
34247 case X86ISD::MCVTPS2PH: {
34248 // Truncations/Conversions - upper elements are known zero.
34249 EVT SrcVT = Op.getOperand(0).getValueType();
34250 if (SrcVT.isVector()) {
34251 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34252 if (NumElts > NumSrcElts &&
34253 DemandedElts.countTrailingZeros() >= NumSrcElts)
34254 Known.setAllZero();
34255 }
34256 break;
34257 }
34258 case X86ISD::STRICT_CVTTP2SI:
34259 case X86ISD::STRICT_CVTTP2UI:
34260 case X86ISD::STRICT_CVTSI2P:
34261 case X86ISD::STRICT_CVTUI2P:
34262 case X86ISD::STRICT_VFPROUND:
34263 case X86ISD::STRICT_CVTPS2PH: {
34264 // Strict Conversions - upper elements are known zero.
34265 EVT SrcVT = Op.getOperand(1).getValueType();
34266 if (SrcVT.isVector()) {
34267 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34268 if (NumElts > NumSrcElts &&
34269 DemandedElts.countTrailingZeros() >= NumSrcElts)
34270 Known.setAllZero();
34271 }
34272 break;
34273 }
34274 case X86ISD::MOVQ2DQ: {
34275 // Move from MMX to XMM. Upper half of XMM should be 0.
34276 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34277 Known.setAllZero();
34278 break;
34279 }
34280 }
34281
34282 // Handle target shuffles.
34283 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34284 if (isTargetShuffle(Opc)) {
34285 bool IsUnary;
34286 SmallVector<int, 64> Mask;
34287 SmallVector<SDValue, 2> Ops;
34288 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
34289 IsUnary)) {
34290 unsigned NumOps = Ops.size();
34291 unsigned NumElts = VT.getVectorNumElements();
34292 if (Mask.size() == NumElts) {
34293 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34294 Known.Zero.setAllBits(); Known.One.setAllBits();
34295 for (unsigned i = 0; i != NumElts; ++i) {
34296 if (!DemandedElts[i])
34297 continue;
34298 int M = Mask[i];
34299 if (M == SM_SentinelUndef) {
34300 // For UNDEF elements, we don't know anything about the common state
34301 // of the shuffle result.
34302 Known.resetAll();
34303 break;
34304 } else if (M == SM_SentinelZero) {
34305 Known.One.clearAllBits();
34306 continue;
34307 }
34308 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34309 "Shuffle index out of range");
34310
34311 unsigned OpIdx = (unsigned)M / NumElts;
34312 unsigned EltIdx = (unsigned)M % NumElts;
34313 if (Ops[OpIdx].getValueType() != VT) {
34314 // TODO - handle target shuffle ops with different value types.
34315 Known.resetAll();
34316 break;
34317 }
34318 DemandedOps[OpIdx].setBit(EltIdx);
34319 }
34320 // Known bits are the values that are shared by every demanded element.
34321 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
34322 if (!DemandedOps[i])
34323 continue;
34324 KnownBits Known2 =
34325 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
34326 Known = KnownBits::commonBits(Known, Known2);
34327 }
34328 }
34329 }
34330 }
34331}
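
As a standalone illustration of the VSHLI case in computeKnownBitsForTargetNode, with hypothetical 16-bit masks instead of the KnownBits class: the known-zero and known-one masks shift with the value, and the vacated low bits become known zero.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical 16-bit element: low 4 bits known zero, bit 4 known one.
  uint16_t KnownZero = 0x000F;
  uint16_t KnownOne = 0x0010;
  unsigned ShAmt = 3; // VSHLI shift amount

  KnownZero = (uint16_t)(KnownZero << ShAmt);
  KnownOne = (uint16_t)(KnownOne << ShAmt);
  KnownZero |= (uint16_t)((1u << ShAmt) - 1); // vacated low bits are zero

  std::printf("Zero=0x%04X One=0x%04X\n", KnownZero, KnownOne);
  // Prints Zero=0x007F One=0x0080: bits 0..6 known zero, bit 7 known one.
  return 0;
}
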
34332
34333unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
34334 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
34335 unsigned Depth) const {
34336 EVT VT = Op.getValueType();
34337 unsigned VTBits = VT.getScalarSizeInBits();
34338 unsigned Opcode = Op.getOpcode();
34339 switch (Opcode) {
34340 case X86ISD::SETCC_CARRY:
34341 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
34342 return VTBits;
34343
34344 case X86ISD::VTRUNC: {
34345 SDValue Src = Op.getOperand(0);
34346 MVT SrcVT = Src.getSimpleValueType();
34347 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
34348 assert(VTBits < NumSrcBits && "Illegal truncation input type");
34349 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
34350 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
34351 if (Tmp > (NumSrcBits - VTBits))
34352 return Tmp - (NumSrcBits - VTBits);
34353 return 1;
34354 }
34355
34356 case X86ISD::PACKSS: {
34357 // PACKSS is just a truncation if the sign bits extend to the packed size.
34358 APInt DemandedLHS, DemandedRHS;
34359 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
34360 DemandedRHS);
34361
34362 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
34363 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
34364 if (!!DemandedLHS)
34365 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34366 if (!!DemandedRHS)
34367 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34368 unsigned Tmp = std::min(Tmp0, Tmp1);
34369 if (Tmp > (SrcBits - VTBits))
34370 return Tmp - (SrcBits - VTBits);
34371 return 1;
34372 }
34373
34374 case X86ISD::VSHLI: {
34375 SDValue Src = Op.getOperand(0);
34376 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
34377 if (ShiftVal.uge(VTBits))
34378 return VTBits; // Shifted all bits out --> zero.
34379 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34380 if (ShiftVal.uge(Tmp))
34381 return 1; // Shifted all sign bits out --> unknown.
34382 return Tmp - ShiftVal.getZExtValue();
34383 }
34384
34385 case X86ISD::VSRAI: {
34386 SDValue Src = Op.getOperand(0);
34387 APInt ShiftVal = Op.getConstantOperandAPInt(1);
34388 if (ShiftVal.uge(VTBits - 1))
34389 return VTBits; // Sign splat.
34390 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
34391 ShiftVal += Tmp;
34392 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
34393 }
34394
34395 case X86ISD::PCMPGT:
34396 case X86ISD::PCMPEQ:
34397 case X86ISD::CMPP:
34398 case X86ISD::VPCOM:
34399 case X86ISD::VPCOMU:
34400 // Vector compares return zero/all-bits result values.
34401 return VTBits;
34402
34403 case X86ISD::ANDNP: {
34404 unsigned Tmp0 =
34405 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
34406 if (Tmp0 == 1) return 1; // Early out.
34407 unsigned Tmp1 =
34408 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
34409 return std::min(Tmp0, Tmp1);
34410 }
34411
34412 case X86ISD::CMOV: {
34413 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
34414 if (Tmp0 == 1) return 1; // Early out.
34415 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
34416 return std::min(Tmp0, Tmp1);
34417 }
34418 }
34419
34420 // Handle target shuffles.
34421 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34422 if (isTargetShuffle(Opcode)) {
34423 bool IsUnary;
34424 SmallVector<int, 64> Mask;
34425 SmallVector<SDValue, 2> Ops;
34426 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
34427 IsUnary)) {
34428 unsigned NumOps = Ops.size();
34429 unsigned NumElts = VT.getVectorNumElements();
34430 if (Mask.size() == NumElts) {
34431 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34432 for (unsigned i = 0; i != NumElts; ++i) {
34433 if (!DemandedElts[i])
34434 continue;
34435 int M = Mask[i];
34436 if (M == SM_SentinelUndef) {
34437 // For UNDEF elements, we don't know anything about the common state
34438 // of the shuffle result.
34439 return 1;
34440 } else if (M == SM_SentinelZero) {
34441 // Zero = all sign bits.
34442 continue;
34443 }
34444 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34445 "Shuffle index out of range");
34446
34447 unsigned OpIdx = (unsigned)M / NumElts;
34448 unsigned EltIdx = (unsigned)M % NumElts;
34449 if (Ops[OpIdx].getValueType() != VT) {
34450 // TODO - handle target shuffle ops with different value types.
34451 return 1;
34452 }
34453 DemandedOps[OpIdx].setBit(EltIdx);
34454 }
34455 unsigned Tmp0 = VTBits;
34456 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
34457 if (!DemandedOps[i])
34458 continue;
34459 unsigned Tmp1 =
34460 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
34461 Tmp0 = std::min(Tmp0, Tmp1);
34462 }
34463 return Tmp0;
34464 }
34465 }
34466 }
34467
34468 // Fallback case.
34469 return 1;
34470}
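
A standalone numeric check of the VSRAI rule above, assuming a hypothetical 16-bit element: an arithmetic right shift by s adds s copies of the sign bit, so the sign-bit count grows by s, saturating at the element width.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Count redundant sign bits of a 16-bit value (the sign bit itself counts).
static unsigned numSignBits(uint16_t V) {
  unsigned Sign = (V >> 15) & 1u;
  unsigned N = 1;
  while (N < 16 && ((V >> (15 - N)) & 1u) == Sign)
    ++N;
  return N;
}

int main() {
  int16_t X = (int16_t)0xFF80;             // 9 sign bits
  unsigned ShAmt = 4;
  int16_t Shifted = (int16_t)(X >> ShAmt); // arithmetic shift: 0xFFF8
  unsigned Expected = std::min(numSignBits((uint16_t)X) + ShAmt, 16u);
  std::printf("before=%u after=%u expected=%u\n", numSignBits((uint16_t)X),
              numSignBits((uint16_t)Shifted), Expected); // 9 13 13
  return 0;
}
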
34471
34472SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
34473 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
34474 return N->getOperand(0);
34475 return N;
34476}
34477
34478// Helper to look for a normal load that can be narrowed into a vzload with the
34479// specified VT and memory VT. Returns SDValue() on failure.
34480static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
34481 SelectionDAG &DAG) {
34482 // Can't if the load is volatile or atomic.
34483 if (!LN->isSimple())
34484 return SDValue();
34485
34486 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
34487 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
34488 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
34489 LN->getPointerInfo(), LN->getOriginalAlign(),
34490 LN->getMemOperand()->getFlags());
34491}
34492
34493// Attempt to match a combined shuffle mask against supported unary shuffle
34494// instructions.
34495// TODO: Investigate sharing more of this with shuffle lowering.
34496static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
34497 bool AllowFloatDomain, bool AllowIntDomain,
34498 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
34499 const X86Subtarget &Subtarget, unsigned &Shuffle,
34500 MVT &SrcVT, MVT &DstVT) {
34501 unsigned NumMaskElts = Mask.size();
34502 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
34503
34504 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
34505 if (MaskEltSize == 32 && Mask[0] == 0) {
34506 if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
34507 Shuffle = X86ISD::VZEXT_MOVL;
34508 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
34509 return true;
34510 }
34511 if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
34512 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
34513 Shuffle = X86ISD::VZEXT_MOVL;
34514 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
34515 return true;
34516 }
34517 }
34518
34519 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
34520 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
34521 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
34522 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
34523 unsigned MaxScale = 64 / MaskEltSize;
34524 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
34525 bool MatchAny = true;
34526 bool MatchZero = true;
34527 unsigned NumDstElts = NumMaskElts / Scale;
34528 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
34529 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
34530 MatchAny = MatchZero = false;
34531 break;
34532 }
34533 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
34534 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
34535 }
34536 if (MatchAny || MatchZero) {
34537 assert(MatchZero && "Failed to match zext but matched aext?");
34538 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
34539 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
34540 MVT::getIntegerVT(MaskEltSize);
34541 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
34542
34543 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
34544 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
34545
34546 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
34547 if (SrcVT.getVectorNumElements() != NumDstElts)
34548 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
34549
34550 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
34551 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
34552 return true;
34553 }
34554 }
34555 }
34556
34557 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
34558 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
34559 isUndefOrEqual(Mask[0], 0) &&
34560 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
34561 Shuffle = X86ISD::VZEXT_MOVL;
34562 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
34563 return true;
34564 }
34565
34566 // Check if we have SSE3, which lets us use MOVDDUP etc. These
34567 // instructions are no slower than UNPCKLPD but have the option to
34568 // fold the input operand into even an unaligned memory load.
34569 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
34570 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
34571 Shuffle = X86ISD::MOVDDUP;
34572 SrcVT = DstVT = MVT::v2f64;
34573 return true;
34574 }
34575 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
34576 Shuffle = X86ISD::MOVSLDUP;
34577 SrcVT = DstVT = MVT::v4f32;
34578 return true;
34579 }
34580 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
34581 Shuffle = X86ISD::MOVSHDUP;
34582 SrcVT = DstVT = MVT::v4f32;
34583 return true;
34584 }
34585 }
34586
34587 if (MaskVT.is256BitVector() && AllowFloatDomain) {
34588 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
34589 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
34590 Shuffle = X86ISD::MOVDDUP;
34591 SrcVT = DstVT = MVT::v4f64;
34592 return true;
34593 }
34594 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
34595 Shuffle = X86ISD::MOVSLDUP;
34596 SrcVT = DstVT = MVT::v8f32;
34597 return true;
34598 }
34599 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
34600 Shuffle = X86ISD::MOVSHDUP;
34601 SrcVT = DstVT = MVT::v8f32;
34602 return true;
34603 }
34604 }
34605
34606 if (MaskVT.is512BitVector() && AllowFloatDomain) {
34607 assert(Subtarget.hasAVX512() &&
34608 "AVX512 required for 512-bit vector shuffles");
34609 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
34610 Shuffle = X86ISD::MOVDDUP;
34611 SrcVT = DstVT = MVT::v8f64;
34612 return true;
34613 }
34614 if (isTargetShuffleEquivalent(
34615 MaskVT, Mask,
34616 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
34617 Shuffle = X86ISD::MOVSLDUP;
34618 SrcVT = DstVT = MVT::v16f32;
34619 return true;
34620 }
34621 if (isTargetShuffleEquivalent(
34622 MaskVT, Mask,
34623 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
34624 Shuffle = X86ISD::MOVSHDUP;
34625 SrcVT = DstVT = MVT::v16f32;
34626 return true;
34627 }
34628 }
34629
34630 return false;
34631}
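
The MOVDDUP/MOVSLDUP/MOVSHDUP checks in matchUnaryShuffle compare the combined mask against fixed reference patterns while tolerating undef elements. A simplified standalone sketch (the helper name is made up, and the real isTargetShuffleEquivalent also handles zeroable elements and operand checks):

#include <cstdio>
#include <vector>

// -1 stands in for an undef mask element (SM_SentinelUndef).
static bool matchesPattern(const std::vector<int> &Mask,
                           const std::vector<int> &Ref) {
  if (Mask.size() != Ref.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != -1 && Mask[i] != Ref[i])
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, -1, 2, 2}; // undef in lane 1
  bool IsMovSLDup = matchesPattern(Mask, {0, 0, 2, 2});
  bool IsMovSHDup = matchesPattern(Mask, {1, 1, 3, 3});
  std::printf("MOVSLDUP=%d MOVSHDUP=%d\n", IsMovSLDup, IsMovSHDup); // 1 0
  return 0;
}
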
34632
34633// Attempt to match a combined shuffle mask against supported unary immediate
34634// permute instructions.
34635// TODO: Investigate sharing more of this with shuffle lowering.
34636static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
34637 const APInt &Zeroable,
34638 bool AllowFloatDomain, bool AllowIntDomain,
34639 const X86Subtarget &Subtarget,
34640 unsigned &Shuffle, MVT &ShuffleVT,
34641 unsigned &PermuteImm) {
34642 unsigned NumMaskElts = Mask.size();
34643 unsigned InputSizeInBits = MaskVT.getSizeInBits();
34644 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
34645 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
34646 bool ContainsZeros = isAnyZero(Mask);
34647
34648 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
34649 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
34650 // Check for lane crossing permutes.
34651 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
34652 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
34653 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
34654 Shuffle = X86ISD::VPERMI;
34655 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
34656 PermuteImm = getV4X86ShuffleImm(Mask);
34657 return true;
34658 }
34659 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
34660 SmallVector<int, 4> RepeatedMask;
34661 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
34662 Shuffle = X86ISD::VPERMI;
34663 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
34664 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
34665 return true;
34666 }
34667 }
34668 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
34669 // VPERMILPD can permute with a non-repeating shuffle.
34670 Shuffle = X86ISD::VPERMILPI;
34671 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
34672 PermuteImm = 0;
34673 for (int i = 0, e = Mask.size(); i != e; ++i) {
34674 int M = Mask[i];
34675 if (M == SM_SentinelUndef)
34676 continue;
34677 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
34678 PermuteImm |= (M & 1) << i;
34679 }
34680 return true;
34681 }
34682 }
34683
34684 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
34685 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
34686 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
34687 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
34688 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
34689 SmallVector<int, 4> RepeatedMask;
34690 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
34691 // Narrow the repeated mask to create 32-bit element permutes.
34692 SmallVector<int, 4> WordMask = RepeatedMask;
34693 if (MaskScalarSizeInBits == 64)
34694 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
34695
34696 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
34697 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
34698 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
34699 PermuteImm = getV4X86ShuffleImm(WordMask);
34700 return true;
34701 }
34702 }
34703
34704 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
34705 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
34706 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
34707 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
34708 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
34709 SmallVector<int, 4> RepeatedMask;
34710 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
34711 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
34712 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
34713
34714 // PSHUFLW: permute lower 4 elements only.
34715 if (isUndefOrInRange(LoMask, 0, 4) &&
34716 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
34717 Shuffle = X86ISD::PSHUFLW;
34718 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
34719 PermuteImm = getV4X86ShuffleImm(LoMask);
34720 return true;
34721 }
34722
34723 // PSHUFHW: permute upper 4 elements only.
34724 if (isUndefOrInRange(HiMask, 4, 8) &&
34725 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
34726 // Offset the HiMask so that we can create the shuffle immediate.
34727 int OffsetHiMask[4];
34728 for (int i = 0; i != 4; ++i)
34729 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
34730
34731 Shuffle = X86ISD::PSHUFHW;
34732 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
34733 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
34734 return true;
34735 }
34736 }
34737 }
34738
34739 // Attempt to match against byte/bit shifts.
34740 if (AllowIntDomain &&
34741 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
34742 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
34743 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
34744 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
34745 Mask, 0, Zeroable, Subtarget);
34746 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
34747 32 <= ShuffleVT.getScalarSizeInBits())) {
34748 PermuteImm = (unsigned)ShiftAmt;
34749 return true;
34750 }
34751 }
34752
34753 // Attempt to match against bit rotates.
34754 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
34755 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
34756 Subtarget.hasAVX512())) {
34757 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
34758 Subtarget, Mask);
34759 if (0 < RotateAmt) {
34760 Shuffle = X86ISD::VROTLI;
34761 PermuteImm = (unsigned)RotateAmt;
34762 return true;
34763 }
34764 }
34765
34766 return false;
34767}
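
The PSHUFD/PSHUFLW/PSHUFHW paths above all produce a 4-lane shuffle immediate. A standalone sketch of that encoding, two bits per destination lane, with undef lanes defaulting to 0 here (getV4X86ShuffleImm may pick defaults differently):

#include <array>
#include <cstdio>

// Pack a 4-element shuffle mask into an 8-bit immediate: two bits per lane,
// lane i stored at bit position 2*i. Undef lanes (-1) default to 0 here.
static unsigned packShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];
    Imm |= (unsigned)(M & 3) << (2 * i);
  }
  return Imm;
}

int main() {
  // The lane-reversing pattern {3, 2, 1, 0} packs to 0x1B.
  std::printf("imm=0x%02X\n", packShuffleImm({3, 2, 1, 0}));
  return 0;
}
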
34768
34769// Attempt to match a combined unary shuffle mask against supported binary
34770// shuffle instructions.
34771// TODO: Investigate sharing more of this with shuffle lowering.
34772static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
34773 bool AllowFloatDomain, bool AllowIntDomain,
34774 SDValue &V1, SDValue &V2, const SDLoc &DL,
34775 SelectionDAG &DAG, const X86Subtarget &Subtarget,
34776 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
34777 bool IsUnary) {
34778 unsigned NumMaskElts = Mask.size();
34779 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
34780
34781 if (MaskVT.is128BitVector()) {
34782 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
34783 V2 = V1;
34784 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
34785 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
34786 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
34787 return true;
34788 }
34789 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
34790 V2 = V1;
34791 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
34792 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
34793 return true;
34794 }
34795 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
34796 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
34797 std::swap(V1, V2);
34798 Shuffle = X86ISD::MOVSD;
34799 SrcVT = DstVT = MVT::v2f64;
34800 return true;
34801 }
34802 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
34803 (AllowFloatDomain || !Subtarget.hasSSE41())) {
34804 Shuffle = X86ISD::MOVSS;
34805 SrcVT = DstVT = MVT::v4f32;
34806 return true;
34807 }
34808 }
34809
34810 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
34811 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
34812 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
34813 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
34814 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
34815 Subtarget)) {
34816 DstVT = MaskVT;
34817 return true;
34818 }
34819 }
34820
34821 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
34822 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
34823 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
34824 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
34825 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
34826 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
34827 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
34828 Subtarget)) {
34829 SrcVT = DstVT = MaskVT;
34830 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
34831 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
34832 return true;
34833 }
34834 }
34835
34836 // Attempt to match against an OR if we're performing a blend shuffle and the
34837 // non-blended source element is zero in each case.
34838 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
34839 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
34840 bool IsBlend = true;
34841 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
34842 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
34843 unsigned Scale1 = NumV1Elts / NumMaskElts;
34844 unsigned Scale2 = NumV2Elts / NumMaskElts;
34845 APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
34846 APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
34847 for (unsigned i = 0; i != NumMaskElts; ++i) {
34848 int M = Mask[i];
34849 if (M == SM_SentinelUndef)
34850 continue;
34851 if (M == SM_SentinelZero) {
34852 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
34853 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
34854 continue;
34855 }
34856 if (M == (int)i) {
34857 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
34858 continue;
34859 }
34860 if (M == (int)(i + NumMaskElts)) {
34861 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
34862 continue;
34863 }
34864 IsBlend = false;
34865 break;
34866 }
34867 if (IsBlend &&
34868 DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
34869 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
34870 Shuffle = ISD::OR;
34871 SrcVT = DstVT = MaskVT.changeTypeToInteger();
34872 return true;
34873 }
34874 }
34875
34876 return false;
34877}
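
A standalone numeric sketch of the blend-to-OR logic in matchBinaryShuffle: if, for every lane, the source not selected by the blend is zero, the blend result equals a bitwise OR of the two vectors; the lane values below are hypothetical.

#include <array>
#include <cstdio>

int main() {
  // Blend selector per lane: 0 picks A[i], 1 picks B[i].
  std::array<int, 4> Sel = {0, 1, 0, 1};
  // Lanes not selected from each source are zero, so OR equals the blend.
  std::array<unsigned, 4> A = {0x11, 0x00, 0x33, 0x00};
  std::array<unsigned, 4> B = {0x00, 0x22, 0x00, 0x44};

  bool Ok = true;
  for (int i = 0; i != 4; ++i) {
    unsigned Blend = Sel[i] ? B[i] : A[i];
    unsigned Or = A[i] | B[i];
    Ok = Ok && (Blend == Or);
  }
  std::printf("blend == or for every lane: %d\n", Ok); // prints 1
  return 0;
}
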
34878
34879static bool matchBinaryPermuteShuffle(
34880 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
34881 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
34882 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
34883 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
34884 unsigned NumMaskElts = Mask.size();
34885 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
34886
34887 // Attempt to match against VALIGND/VALIGNQ rotate.
34888 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
34889 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
34890 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
34891 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
34892 if (!isAnyZero(Mask)) {
34893 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
34894 if (0 < Rotation) {
34895 Shuffle = X86ISD::VALIGN;
34896 if (EltSizeInBits == 64)
34897 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
34898 else
34899 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
34900 PermuteImm = Rotation;
34901 return true;
34902 }
34903 }
34904 }
34905
34906 // Attempt to match against PALIGNR byte rotate.
34907 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
34908 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
34909 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
34910 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
34911 if (0 < ByteRotation) {
34912 Shuffle = X86ISD::PALIGNR;
34913 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
34914 PermuteImm = ByteRotation;
34915 return true;
34916 }
34917 }
34918
34919 // Attempt to combine to X86ISD::BLENDI.
34920 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
34921 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
34922 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
34923 uint64_t BlendMask = 0;
34924 bool ForceV1Zero = false, ForceV2Zero = false;
34925 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
34926 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
34927 ForceV2Zero, BlendMask)) {
34928 if (MaskVT == MVT::v16i16) {
34929 // We can only use v16i16 PBLENDW if the lanes are repeated.
34930 SmallVector<int, 8> RepeatedMask;
34931 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
34932 RepeatedMask)) {
34933 assert(RepeatedMask.size() == 8 &&
34934 "Repeated mask size doesn't match!");
34935 PermuteImm = 0;
34936 for (int i = 0; i < 8; ++i)
34937 if (RepeatedMask[i] >= 8)
34938 PermuteImm |= 1 << i;
34939 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
34940 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
34941 Shuffle = X86ISD::BLENDI;
34942 ShuffleVT = MaskVT;
34943 return true;
34944 }
34945 } else {
34946 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
34947 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
34948 PermuteImm = (unsigned)BlendMask;
34949 Shuffle = X86ISD::BLENDI;
34950 ShuffleVT = MaskVT;
34951 return true;
34952 }
34953 }
34954 }
34955
34956 // Attempt to combine to INSERTPS, but only if it has elements that need to
34957 // be set to zero.
34958 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
34959 MaskVT.is128BitVector() && isAnyZero(Mask) &&
34960 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
34961 Shuffle = X86ISD::INSERTPS;
34962 ShuffleVT = MVT::v4f32;
34963 return true;
34964 }
34965
34966 // Attempt to combine to SHUFPD.
34967 if (AllowFloatDomain && EltSizeInBits == 64 &&
34968 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
34969 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
34970 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
34971 bool ForceV1Zero = false, ForceV2Zero = false;
34972 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
34973 PermuteImm, Mask, Zeroable)) {
34974 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
34975 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
34976 Shuffle = X86ISD::SHUFP;
34977 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
34978 return true;
34979 }
34980 }
34981
34982 // Attempt to combine to SHUFPS.
34983 if (AllowFloatDomain && EltSizeInBits == 32 &&
34984 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
34985 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
34986 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
34987 SmallVector<int, 4> RepeatedMask;
34988 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
34989 // Match each half of the repeated mask to determine if it's just
34990 // referencing one of the vectors, is zeroable or entirely undef.
34991 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
34992 int M0 = RepeatedMask[Offset];
34993 int M1 = RepeatedMask[Offset + 1];
34994
34995 if (isUndefInRange(RepeatedMask, Offset, 2)) {
34996 return DAG.getUNDEF(MaskVT);
34997 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
34998 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
34999 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35000 return getZeroVector(MaskVT, Subtarget, DAG, DL);
35001 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35002 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35003 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35004 return V1;
35005 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35006 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35007 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35008 return V2;
35009 }
35010
35011 return SDValue();
35012 };
35013
35014 int ShufMask[4] = {-1, -1, -1, -1};
35015 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35016 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35017
35018 if (Lo && Hi) {
35019 V1 = Lo;
35020 V2 = Hi;
35021 Shuffle = X86ISD::SHUFP;
35022 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35023 PermuteImm = getV4X86ShuffleImm(ShufMask);
35024 return true;
35025 }
35026 }
35027 }
35028
35029 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35030 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35031 MaskVT.is128BitVector() &&
35032 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35033 Shuffle = X86ISD::INSERTPS;
35034 ShuffleVT = MVT::v4f32;
35035 return true;
35036 }
35037
35038 return false;
35039}
35040
35041static SDValue combineX86ShuffleChainWithExtract(
35042 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35043 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
35044 const X86Subtarget &Subtarget);
35045
35046/// Combine an arbitrary chain of shuffles into a single instruction if
35047/// possible.
35048///
35049/// This is the leaf of the recursive combine below. When we have found some
35050/// chain of single-use x86 shuffle instructions and accumulated the combined
35051/// shuffle mask represented by them, this will try to pattern match that mask
35052/// into either a single instruction if there is a special purpose instruction
35053/// for this operation, or into a PSHUFB instruction which is a fully general
35054/// instruction but should only be used to replace chains over a certain depth.
35055static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35056 ArrayRef<int> BaseMask, int Depth,
35057 bool HasVariableMask,
35058 bool AllowVariableMask, SelectionDAG &DAG,
35059 const X86Subtarget &Subtarget) {
35060 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35061 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35062        "Unexpected number of shuffle inputs!");
35063
35064 MVT RootVT = Root.getSimpleValueType();
35065 unsigned RootSizeInBits = RootVT.getSizeInBits();
35066 unsigned NumRootElts = RootVT.getVectorNumElements();
35067
35068 // Canonicalize shuffle input op to the requested type.
35069 // TODO: Support cases where Op is smaller than VT.
35070 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35071 return DAG.getBitcast(VT, Op);
35072 };
35073
35074 // Find the inputs that enter the chain. Note that multiple uses are OK
35075 // here, we're not going to remove the operands we find.
35076 bool UnaryShuffle = (Inputs.size() == 1);
35077 SDValue V1 = peekThroughBitcasts(Inputs[0]);
35078 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35079 : peekThroughBitcasts(Inputs[1]));
35080
35081 MVT VT1 = V1.getSimpleValueType();
35082 MVT VT2 = V2.getSimpleValueType();
35083 assert(VT1.getSizeInBits() == RootSizeInBits &&
35084        VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35085
35086 SDLoc DL(Root);
35087 SDValue Res;
35088
35089 unsigned NumBaseMaskElts = BaseMask.size();
35090 if (NumBaseMaskElts == 1) {
35091 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35092 return CanonicalizeShuffleInput(RootVT, V1);
35093 }
35094
35095 bool OptForSize = DAG.shouldOptForSize();
35096 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35097 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35098 (RootVT.isFloatingPoint() && Depth >= 1) ||
35099 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35100
35101 // Don't combine if we are an AVX512/EVEX target and the mask element size
35102 // is different from the root element size - this would prevent writemasks
35103 // from being reused.
35104 bool IsMaskedShuffle = false;
35105 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35106 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35107 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35108 IsMaskedShuffle = true;
35109 }
35110 }
35111
35112 // If we are shuffling a broadcast (and not introducing zeros) then
35113 // we can just use the broadcast directly. This works for smaller broadcast
35114 // elements as well, as they already repeat across each mask element.
35115 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35116 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35117 V1.getValueSizeInBits() >= RootSizeInBits) {
35118 return CanonicalizeShuffleInput(RootVT, V1);
35119 }
35120
35121 // Attempt to match a subvector broadcast.
35122 // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
35123 if (UnaryShuffle &&
35124 (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
35125 if (isUndefOrEqual(BaseMask, 0)) {
35126 SDValue Src = Inputs[0];
35127 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
35128 Src.getOperand(0).isUndef() &&
35129 Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
35130 MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
35131 return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
35132 Src.getValueType(),
35133 Src.getOperand(1)));
35134 }
35135 }
35136 }
35137
35138 // Handle 128/256-bit lane shuffles of 512-bit vectors.
35139 if (RootVT.is512BitVector() &&
35140 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35141 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35142
35143 // If the upper subvectors are zeroable, then an extract+insert is more
35144 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
35145 // to zero the upper subvectors.
35146 if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35147 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35148 return SDValue(); // Nothing to do!
35149 assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35150        "Unexpected lane shuffle");
35151 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35152 unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
35153 bool UseZero = isAnyZero(BaseMask);
35154 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35155 Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35156 return DAG.getBitcast(RootVT, Res);
35157 }
35158
35159 // Narrow shuffle mask to v4x128.
35160 SmallVector<int, 4> Mask;
35161 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35162 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35163
35164 // Try to lower to vshuf64x2/vshuf32x4.
35165 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35166 SDValue V1, SDValue V2, SelectionDAG &DAG) {
35167 unsigned PermMask = 0;
35168 // Ensure elements came from the same Op.
35169 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35170 for (int i = 0; i < 4; ++i) {
35171 assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35172 if (Mask[i] < 0)
35173 continue;
35174
35175 SDValue Op = Mask[i] >= 4 ? V2 : V1;
35176 unsigned OpIndex = i / 2;
35177 if (Ops[OpIndex].isUndef())
35178 Ops[OpIndex] = Op;
35179 else if (Ops[OpIndex] != Op)
35180 return SDValue();
35181
35182 // Convert the 128-bit shuffle mask selection values into 128-bit
35183 // selection bits defined by a vshuf64x2 instruction's immediate control
35184 // byte.
35185 PermMask |= (Mask[i] % 4) << (i * 2);
35186 }
35187
35188 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35189 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35190 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35191 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35192 };
35193
35194 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35195 // doesn't work because our mask is for 128 bits and we don't have an MVT
35196 // to match that.
35197 bool PreferPERMQ =
35198 UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35199 isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35200 isUndefOrInRange(Mask[3], 2, 4) &&
35201 (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35202 (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35203
35204 if (!isAnyZero(Mask) && !PreferPERMQ) {
35205 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35206 return SDValue(); // Nothing to do!
35207 if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35208 return DAG.getBitcast(RootVT, V);
35209 }
35210 }
35211
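 // A minimal sketch of the vshuf64x2/SHUF128 immediate math used by
 // MatchSHUF128 above (hypothetical lambda, immediate only - the per-half
 // operand selection is handled by the loop above): each of the four result
 // lanes gets a 2-bit field selecting a 128-bit lane, with mask values in
 // [0,8) and values >= 4 referring to the second operand.
 auto SketchBuildShuf128Imm = [](const int (&LaneMask)[4]) {
   unsigned PermMask = 0;
   for (int i = 0; i < 4; ++i)
     if (LaneMask[i] >= 0)
       PermMask |= (unsigned)(LaneMask[i] % 4) << (i * 2);
   return PermMask;
 };
 // Example: {0, 1, 6, 7} keeps lanes 0,1 of the first operand and takes
 // lanes 2,3 of the second operand: 0b11100100 = 0xE4.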
35212 // Handle 128-bit lane shuffles of 256-bit vectors.
35213 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35214 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
35215
35216 // If the upper half is zeroable, then an extract+insert is more optimal
35217 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35218 // zero the upper half.
35219 if (isUndefOrZero(BaseMask[1])) {
35220 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35221 return SDValue(); // Nothing to do!
35222 assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35223 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35224 Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
35225 Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35226 DL, 256);
35227 return DAG.getBitcast(RootVT, Res);
35228 }
35229
35230 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35231 return SDValue(); // Nothing to do!
35232
35233 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35234 // we need to use the zeroing feature.
35235 // Prefer blends for sequential shuffles unless we are optimizing for size.
35236 if (UnaryShuffle &&
35237 !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35238 (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
35239 unsigned PermMask = 0;
35240 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35241 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35242
35243 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35244 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
35245 DAG.getUNDEF(ShuffleVT),
35246 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35247 return DAG.getBitcast(RootVT, Res);
35248 }
35249
35250 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35251 return SDValue(); // Nothing to do!
35252
35253 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35254 if (!UnaryShuffle && !IsMaskedShuffle) {
35255 assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35256        "Unexpected shuffle sentinel value");
35257 // Prefer blends to X86ISD::VPERM2X128.
35258 if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35259 (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35260 unsigned PermMask = 0;
35261 PermMask |= ((BaseMask[0] & 3) << 0);
35262 PermMask |= ((BaseMask[1] & 3) << 4);
35263
35264 SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35265 SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35266 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT,
35267 CanonicalizeShuffleInput(ShuffleVT, LHS),
35268 CanonicalizeShuffleInput(ShuffleVT, RHS),
35269 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35270 return DAG.getBitcast(RootVT, Res);
35271 }
35272 }
35273 }
35274
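 // A minimal sketch of the VPERM2X128 immediate layout used above
 // (hypothetical lambda): each nibble controls one 128-bit half of the
 // result - bits[1:0] pick a source lane (0/1 from the first operand, 2/3
 // from the second) and bit 3 forces the half to zero. A negative lane is
 // treated as "zero this half" here.
 auto SketchBuildVPerm2X128Imm = [](int LoLane, int HiLane) {
   unsigned Lo = LoLane < 0 ? 0x8u : (unsigned)(LoLane & 3);
   unsigned Hi = HiLane < 0 ? 0x8u : (unsigned)(HiLane & 3);
   return Lo | (Hi << 4);
 };
 // Example: (0, 3) -> 0x30 concatenates the low lane of the first source
 // with the high lane of the second; (-1, 1) -> 0x18 zeroes the low half.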
35275 // For masks that have been widened to 128-bit elements or more,
35276 // narrow back down to 64-bit elements.
35277 SmallVector<int, 64> Mask;
35278 if (BaseMaskEltSizeInBits > 64) {
35279 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35280 int MaskScale = BaseMaskEltSizeInBits / 64;
35281 narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35282 } else {
35283 Mask.assign(BaseMask.begin(), BaseMask.end());
35284 }
35285
35286 // For masked shuffles, we're trying to match the root width for better
35287 // writemask folding; attempt to scale the mask.
35288 // TODO - variable shuffles might need this to be widened again.
35289 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35290 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35291 int MaskScale = NumRootElts / Mask.size();
35292 SmallVector<int, 64> ScaledMask;
35293 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35294 Mask = std::move(ScaledMask);
35295 }
35296
35297 unsigned NumMaskElts = Mask.size();
35298 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
35299
35300 // Determine the effective mask value type.
35301 FloatDomain &= (32 <= MaskEltSizeInBits);
35302 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
35303 : MVT::getIntegerVT(MaskEltSizeInBits);
35304 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
35305
35306 // Only allow legal mask types.
35307 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
35308 return SDValue();
35309
35310 // Attempt to match the mask against known shuffle patterns.
35311 MVT ShuffleSrcVT, ShuffleVT;
35312 unsigned Shuffle, PermuteImm;
35313
35314 // Which shuffle domains are permitted?
35315 // Permit domain crossing at higher combine depths.
35316 // TODO: Should we indicate which domain is preferred if both are allowed?
35317 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
35318 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
35319 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
35320
35321 // Determine zeroable mask elements.
35322 APInt KnownUndef, KnownZero;
35323 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
35324 APInt Zeroable = KnownUndef | KnownZero;
35325
35326 if (UnaryShuffle) {
35327 // Attempt to match against broadcast-from-vector.
35328 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
35329 if ((Subtarget.hasAVX2() ||
35330 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
35331 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
35332 if (isUndefOrEqual(Mask, 0)) {
35333 if (V1.getValueType() == MaskVT &&
35334 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35335 MayFoldLoad(V1.getOperand(0))) {
35336 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35337 return SDValue(); // Nothing to do!
35338 Res = V1.getOperand(0);
35339 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35340 return DAG.getBitcast(RootVT, Res);
35341 }
35342 if (Subtarget.hasAVX2()) {
35343 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
35344 return SDValue(); // Nothing to do!
35345 Res = CanonicalizeShuffleInput(MaskVT, V1);
35346 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
35347 return DAG.getBitcast(RootVT, Res);
35348 }
35349 }
35350 }
35351
35352 SDValue NewV1 = V1; // Save operand in case early exit happens.
35353 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35354 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35355 ShuffleVT) &&
35356 (!IsMaskedShuffle ||
35357 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35358 if (Depth == 0 && Root.getOpcode() == Shuffle)
35359 return SDValue(); // Nothing to do!
35360 Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35361 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
35362 return DAG.getBitcast(RootVT, Res);
35363 }
35364
35365 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35366 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
35367 PermuteImm) &&
35368 (!IsMaskedShuffle ||
35369 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35370 if (Depth == 0 && Root.getOpcode() == Shuffle)
35371 return SDValue(); // Nothing to do!
35372 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
35373 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
35374 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35375 return DAG.getBitcast(RootVT, Res);
35376 }
35377 }
35378
35379 // Attempt to combine to INSERTPS, but only if the inserted element has come
35380 // from a scalar.
35381 // TODO: Handle other insertions here as well?
35382 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
35383 Subtarget.hasSSE41() &&
35384 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
35385 if (MaskEltSizeInBits == 32) {
35386 SDValue SrcV1 = V1, SrcV2 = V2;
35387 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
35388 DAG) &&
35389 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
35390 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35391 return SDValue(); // Nothing to do!
35392 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35393 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
35394 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
35395 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35396 return DAG.getBitcast(RootVT, Res);
35397 }
35398 }
35399 if (MaskEltSizeInBits == 64 &&
35400 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
35401 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35402 V2.getScalarValueSizeInBits() <= 32) {
35403 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
35404 return SDValue(); // Nothing to do!
35405 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
35406 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
35407 CanonicalizeShuffleInput(MVT::v4f32, V1),
35408 CanonicalizeShuffleInput(MVT::v4f32, V2),
35409 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35410 return DAG.getBitcast(RootVT, Res);
35411 }
35412 }
35413
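 // A minimal sketch of the INSERTPS immediate used above (hypothetical
 // lambda; layout per the SSE4.1 instruction): bits[7:6] select the source
 // element, bits[5:4] the destination slot, and bits[3:0] zero individual
 // destination elements. With SrcIdx == 0, as in the special case above,
 // "0 << 0" and "0 << 6" encode the same immediate.
 auto SketchBuildInsertPSImm = [](unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZeroMask) {
   return ((SrcIdx & 3) << 6) | ((DstIdx & 3) << 4) | (ZeroMask & 0xf);
 };
 // Example: inserting element 0 of V2 into slot 2 of V1 with nothing zeroed
 // gives 0x20, matching the constant built above.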
35414 SDValue NewV1 = V1; // Save operands in case early exit happens.
35415 SDValue NewV2 = V2;
35416 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
35417 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
35418 ShuffleVT, UnaryShuffle) &&
35419 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35420 if (Depth == 0 && Root.getOpcode() == Shuffle)
35421 return SDValue(); // Nothing to do!
35422 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
35423 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
35424 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
35425 return DAG.getBitcast(RootVT, Res);
35426 }
35427
35428 NewV1 = V1; // Save operands in case early exit happens.
35429 NewV2 = V2;
35430 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
35431 AllowIntDomain, NewV1, NewV2, DL, DAG,
35432 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
35433 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
35434 if (Depth == 0 && Root.getOpcode() == Shuffle)
35435 return SDValue(); // Nothing to do!
35436 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
35437 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
35438 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
35439 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
35440 return DAG.getBitcast(RootVT, Res);
35441 }
35442
35443 // Typically from here on, we need an integer version of MaskVT.
35444 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
35445 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
35446
35447 // Annoyingly, SSE4A instructions don't map into the above match helpers.
35448 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
35449 uint64_t BitLen, BitIdx;
35450 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
35451 Zeroable)) {
35452 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
35453 return SDValue(); // Nothing to do!
35454 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35455 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
35456 DAG.getTargetConstant(BitLen, DL, MVT::i8),
35457 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35458 return DAG.getBitcast(RootVT, Res);
35459 }
35460
35461 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
35462 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
35463 return SDValue(); // Nothing to do!
35464 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
35465 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
35466 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
35467 DAG.getTargetConstant(BitLen, DL, MVT::i8),
35468 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
35469 return DAG.getBitcast(RootVT, Res);
35470 }
35471 }
35472
35473 // Match shuffle against TRUNCATE patterns.
35474 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
35475 // Match against a VTRUNC instruction, accounting for src/dst sizes.
35476 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
35477 Subtarget)) {
35478 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
35479 ShuffleSrcVT.getVectorNumElements();
35480 unsigned Opc =
35481 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
35482 if (Depth == 0 && Root.getOpcode() == Opc)
35483 return SDValue(); // Nothing to do!
35484 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35485 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
35486 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
35487 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
35488 return DAG.getBitcast(RootVT, Res);
35489 }
35490
35491 // Do we need a more general binary truncation pattern?
35492 if (RootSizeInBits < 512 &&
35493 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
35494 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
35495 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
35496 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
35497 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
35498 return SDValue(); // Nothing to do!
35499 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
35500 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
35501 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
35502 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
35503 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
35504 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
35505 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
35506 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
35507 return DAG.getBitcast(RootVT, Res);
35508 }
35509 }
35510
35511 // Don't try to re-form single instruction chains under any circumstances now
35512 // that we've done encoding canonicalization for them.
35513 if (Depth < 1)
35514 return SDValue();
35515
35516 // Depth threshold above which we can efficiently use variable mask shuffles.
35517 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
35518 AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
35519 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
35520 // higher depth before combining them.
35521 bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
35522
35523 bool MaskContainsZeros = isAnyZero(Mask);
35524
35525 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
35526 // If we have a single input lane-crossing shuffle then lower to VPERMV.
35527 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
35528 if (Subtarget.hasAVX2() &&
35529 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
35530 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
35531 Res = CanonicalizeShuffleInput(MaskVT, V1);
35532 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
35533 return DAG.getBitcast(RootVT, Res);
35534 }
35535 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
35536 if ((Subtarget.hasAVX512() &&
35537 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
35538 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
35539 (Subtarget.hasBWI() &&
35540 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35541 (Subtarget.hasVBMI() &&
35542 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
35543 V1 = CanonicalizeShuffleInput(MaskVT, V1);
35544 V2 = DAG.getUNDEF(MaskVT);
35545 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
35546 return DAG.getBitcast(RootVT, Res);
35547 }
35548 }
35549
35550 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
35551 // vector as the second source (non-VLX will pad to 512-bit shuffles).
35552 if (UnaryShuffle && AllowVariableMask &&
35553 ((Subtarget.hasAVX512() &&
35554 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
35555 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
35556 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
35557 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
35558 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
35559 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35560 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
35561 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
35562 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
35563 for (unsigned i = 0; i != NumMaskElts; ++i)
35564 if (Mask[i] == SM_SentinelZero)
35565 Mask[i] = NumMaskElts + i;
35566 V1 = CanonicalizeShuffleInput(MaskVT, V1);
35567 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
35568 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
35569 return DAG.getBitcast(RootVT, Res);
35570 }
35571
35572 // If that failed and either input is extracted then try to combine as a
35573 // shuffle with the larger type.
35574 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
35575 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
35576 DAG, Subtarget))
35577 return WideShuffle;
35578
35579 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
35580 // (non-VLX will pad to 512-bit shuffles).
35581 if (AllowVariableMask && !MaskContainsZeros &&
35582 ((Subtarget.hasAVX512() &&
35583 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
35584 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
35585 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
35586 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
35587 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
35588 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35589 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
35590 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
35591 V1 = CanonicalizeShuffleInput(MaskVT, V1);
35592 V2 = CanonicalizeShuffleInput(MaskVT, V2);
35593 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
35594 return DAG.getBitcast(RootVT, Res);
35595 }
35596 return SDValue();
35597 }
35598
35599 // See if we can combine a single input shuffle with zeros to a bit-mask,
35600 // which is much simpler than any shuffle.
35601 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
35602 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
35603 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
35604 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
35605 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
35606 APInt UndefElts(NumMaskElts, 0);
35607 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
35608 for (unsigned i = 0; i != NumMaskElts; ++i) {
35609 int M = Mask[i];
35610 if (M == SM_SentinelUndef) {
35611 UndefElts.setBit(i);
35612 continue;
35613 }
35614 if (M == SM_SentinelZero)
35615 continue;
35616 EltBits[i] = AllOnes;
35617 }
35618 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
35619 Res = CanonicalizeShuffleInput(MaskVT, V1);
35620 unsigned AndOpcode =
35621 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
35622 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
35623 return DAG.getBitcast(RootVT, Res);
35624 }
35625
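 // A minimal sketch of the bit-mask fold above (hypothetical lambda): when a
 // unary shuffle only keeps elements in place or zeroes them, it is
 // equivalent to an AND with a constant that is all-ones for kept elements
 // and zero otherwise (undef elements are folded to zero here for brevity;
 // the real code keeps them undef).
 auto SketchBuildAndMaskBits = [](const int *M, int NumElts,
                                  unsigned long long EltAllOnes,
                                  unsigned long long *Bits) {
   for (int i = 0; i < NumElts; ++i)
     Bits[i] = (M[i] == i) ? EltAllOnes : 0;
 };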
35626 // If we have a single input shuffle with different shuffle patterns in the
35627 // 128-bit lanes, use the variable mask to VPERMILPS.
35628 // TODO Combine other mask types at higher depths.
35629 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
35630 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
35631 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
35632 SmallVector<SDValue, 16> VPermIdx;
35633 for (int M : Mask) {
35634 SDValue Idx =
35635 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
35636 VPermIdx.push_back(Idx);
35637 }
35638 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
35639 Res = CanonicalizeShuffleInput(MaskVT, V1);
35640 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
35641 return DAG.getBitcast(RootVT, Res);
35642 }
35643
35644 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
35645 // to VPERMIL2PD/VPERMIL2PS.
35646 if (AllowVariableMask && Subtarget.hasXOP() &&
35647 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
35648 MaskVT == MVT::v8f32)) {
35649 // VPERMIL2 Operation.
35650 // Bits[3] - Match Bit.
35651 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
35652 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
35653 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
35654 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
35655 SmallVector<int, 8> VPerm2Idx;
35656 unsigned M2ZImm = 0;
35657 for (int M : Mask) {
35658 if (M == SM_SentinelUndef) {
35659 VPerm2Idx.push_back(-1);
35660 continue;
35661 }
35662 if (M == SM_SentinelZero) {
35663 M2ZImm = 2;
35664 VPerm2Idx.push_back(8);
35665 continue;
35666 }
35667 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
35668 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
35669 VPerm2Idx.push_back(Index);
35670 }
35671 V1 = CanonicalizeShuffleInput(MaskVT, V1);
35672 V2 = CanonicalizeShuffleInput(MaskVT, V2);
35673 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
35674 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
35675 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
35676 return DAG.getBitcast(RootVT, Res);
35677 }
35678
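 // A minimal sketch of one VPERMIL2 selector value as computed above
 // (hypothetical lambda): the selector indexes an element within the 128-bit
 // lane, with the source operand encoded by whether M addresses the first or
 // second input, and 64-bit elements using even selector values.
 auto SketchVPermil2Index = [](int M, int Elts, int EltsPerLane,
                               bool Is64Bit) {
   int Index = (M % EltsPerLane) + ((M / Elts) * EltsPerLane);
   return Is64Bit ? (Index << 1) : Index;
 };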
35679 // If we have 3 or more shuffle instructions or a chain involving a variable
35680 // mask, we can replace them with a single PSHUFB instruction profitably.
35681 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
35682 // instructions, but in practice PSHUFB tends to be *very* fast so we're
35683 // more aggressive.
35684 if (UnaryShuffle && AllowVariableMask &&
35685 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35686 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
35687 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
35688 SmallVector<SDValue, 16> PSHUFBMask;
35689 int NumBytes = RootVT.getSizeInBits() / 8;
35690 int Ratio = NumBytes / NumMaskElts;
35691 for (int i = 0; i < NumBytes; ++i) {
35692 int M = Mask[i / Ratio];
35693 if (M == SM_SentinelUndef) {
35694 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
35695 continue;
35696 }
35697 if (M == SM_SentinelZero) {
35698 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
35699 continue;
35700 }
35701 M = Ratio * M + i % Ratio;
35702 assert((M / 16) == (i / 16) && "Lane crossing detected");
35703 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
35704 }
35705 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
35706 Res = CanonicalizeShuffleInput(ByteVT, V1);
35707 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
35708 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
35709 return DAG.getBitcast(RootVT, Res);
35710 }
35711
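 // A minimal sketch of the per-byte PSHUFB control built above (hypothetical
 // lambda): a byte with the top bit set (0x80) zeroes the result byte,
 // otherwise the value indexes the source; wider mask elements scale to
 // bytes via Ratio * M + i % Ratio. Undef entries are emitted as zeroing
 // bytes here for brevity.
 auto SketchBuildPSHUFBMask = [](const int *M, int NumElts, int NumBytes,
                                 unsigned char *Bytes) {
   int Ratio = NumBytes / NumElts; // Bytes per mask element.
   for (int i = 0; i < NumBytes; ++i) {
     int E = M[i / Ratio];
     Bytes[i] = E < 0 ? 0x80 : (unsigned char)(Ratio * E + i % Ratio);
   }
 };
 // Example: a v4i32 mask {2, Zero, 0, Undef} over 16 bytes becomes
 // {8,9,10,11, 0x80 x4, 0,1,2,3, 0x80 x4}.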
35712 // With XOP, if we have a 128-bit binary input shuffle we can always combine
35713 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
35714 // slower than PSHUFB on targets that support both.
35715 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
35716 // VPPERM Mask Operation
35717 // Bits[4:0] - Byte Index (0 - 31)
35718 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
35719 SmallVector<SDValue, 16> VPPERMMask;
35720 int NumBytes = 16;
35721 int Ratio = NumBytes / NumMaskElts;
35722 for (int i = 0; i < NumBytes; ++i) {
35723 int M = Mask[i / Ratio];
35724 if (M == SM_SentinelUndef) {
35725 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
35726 continue;
35727 }
35728 if (M == SM_SentinelZero) {
35729 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
35730 continue;
35731 }
35732 M = Ratio * M + i % Ratio;
35733 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
35734 }
35735 MVT ByteVT = MVT::v16i8;
35736 V1 = CanonicalizeShuffleInput(ByteVT, V1);
35737 V2 = CanonicalizeShuffleInput(ByteVT, V2);
35738 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
35739 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
35740 return DAG.getBitcast(RootVT, Res);
35741 }
35742
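 // A minimal sketch of one VPPERM selector byte per the layout commented
 // above (hypothetical lambda): bits[4:0] index the 32 concatenated source
 // bytes and bits[7:5] choose the operation, with 0 copying the byte and 4
 // (i.e. 0x80) forcing zero.
 auto SketchVPPERMSelector = [](int ByteIndex, bool Zero) -> unsigned char {
   if (Zero)
     return 0x80; // Operation 4 in bits[7:5]: zero the result byte.
   return (unsigned char)(ByteIndex & 0x1f); // Operation 0: copy the byte.
 };
 // Example: 0x12 copies byte 18 (byte 2 of the second source); 0x80 writes
 // 0x00 regardless of the sources.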
35743 // If that failed and either input is extracted then try to combine as a
35744 // shuffle with the larger type.
35745 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
35746 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
35747 DAG, Subtarget))
35748 return WideShuffle;
35749
35750 // If we have a dual input shuffle then lower to VPERMV3,
35751 // (non-VLX will pad to 512-bit shuffles)
35752 if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
35753 ((Subtarget.hasAVX512() &&
35754 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
35755 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
35756 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
35757 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
35758 MaskVT == MVT::v16i32)) ||
35759 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
35760 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
35761 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
35762 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
35763 V1 = CanonicalizeShuffleInput(MaskVT, V1);
35764 V2 = CanonicalizeShuffleInput(MaskVT, V2);
35765 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
35766 return DAG.getBitcast(RootVT, Res);
35767 }
35768
35769 // Failed to find any combines.
35770 return SDValue();
35771}
35772
35773// Combine an arbitrary chain of shuffles + extract_subvectors into a single
35774// instruction if possible.
35775//
35776// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
35777// type size to attempt to combine:
35778// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
35779// -->
35780// extract_subvector(shuffle(x,y,m2),0)
35781static SDValue combineX86ShuffleChainWithExtract(
35782 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35783 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
35784 const X86Subtarget &Subtarget) {
35785 unsigned NumMaskElts = BaseMask.size();
35786 unsigned NumInputs = Inputs.size();
35787 if (NumInputs == 0)
35788 return SDValue();
35789
35790 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
35791 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
35792
35793 // Peek through subvectors.
35794 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
35795 unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
35796 for (unsigned i = 0; i != NumInputs; ++i) {
35797 SDValue &Src = WideInputs[i];
35798 unsigned &Offset = Offsets[i];
35799 Src = peekThroughBitcasts(Src);
35800 EVT BaseVT = Src.getValueType();
35801 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
35802 Offset += Src.getConstantOperandVal(1);
35803 Src = Src.getOperand(0);
35804 }
35805 WideSizeInBits = std::max(WideSizeInBits,
35806 (unsigned)Src.getValueSizeInBits());
35807 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
35808        "Unexpected subvector extraction");
35809 Offset /= BaseVT.getVectorNumElements();
35810 Offset *= NumMaskElts;
35811 }
35812
35813 // Bail if we're always extracting from the lowest subvectors;
35814 // combineX86ShuffleChain should match this for the current width.
35815 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
35816 return SDValue();
35817
35818 EVT RootVT = Root.getValueType();
35819 unsigned RootSizeInBits = RootVT.getSizeInBits();
35820 unsigned Scale = WideSizeInBits / RootSizeInBits;
35821 assert((WideSizeInBits % RootSizeInBits) == 0 &&
35822        "Unexpected subvector extraction");
35823
35824 // If the src vector types aren't the same, see if we can extend
35825 // them to match each other.
35826 // TODO: Support different scalar types?
35827 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
35828 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
35829 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
35830 Op.getValueType().getScalarType() != WideSVT;
35831 }))
35832 return SDValue();
35833
35834 for (SDValue &NewInput : WideInputs) {
35835 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
35836        "Shuffle vector size mismatch");
35837 if (WideSizeInBits > NewInput.getValueSizeInBits())
35838 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
35839 SDLoc(NewInput), WideSizeInBits);
35840 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
35841        "Unexpected subvector extraction");
35842 }
35843
35844 // Create new mask for larger type.
35845 for (unsigned i = 1; i != NumInputs; ++i)
35846 Offsets[i] += i * Scale * NumMaskElts;
35847
35848 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
35849 for (int &M : WideMask) {
35850 if (M < 0)
35851 continue;
35852 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
35853 }
35854 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
35855
35856 // Remove unused/repeated shuffle source ops.
35857 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
35858 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
35859
35860 if (WideInputs.size() > 2)
35861 return SDValue();
35862
35863 // Increase depth for every upper subvector we've peeked through.
35864 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
35865
35866 // Attempt to combine wider chain.
35867 // TODO: Can we use a better Root?
35868 SDValue WideRoot = WideInputs[0];
35869 if (SDValue WideShuffle = combineX86ShuffleChain(
35870 WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
35871 AllowVariableMask, DAG, Subtarget)) {
35872 WideShuffle =
35873 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
35874 return DAG.getBitcast(RootVT, WideShuffle);
35875 }
35876 return SDValue();
35877}
35878
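// A minimal sketch (hypothetical helper) of the mask widening performed by
// combineX86ShuffleChainWithExtract above: each in-range mask element is
// rebased onto its wide source using per-input offsets (assumed to already be
// expressed in mask elements and to include each input's base position), then
// the mask is padded with undef entries to cover the wider vector width.
static void sketchWidenShuffleMask(const int *Mask, int NumMaskElts,
                                   const unsigned *Offsets, int Scale,
                                   int *WideMask) {
  for (int i = 0; i < NumMaskElts; ++i) {
    int M = Mask[i];
    WideMask[i] =
        M < 0 ? M : (M % NumMaskElts) + (int)Offsets[M / NumMaskElts];
  }
  for (int i = NumMaskElts; i < Scale * NumMaskElts; ++i)
    WideMask[i] = -1; // Undef padding for the widened elements.
}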
35879// Canonicalize the combined shuffle mask chain with horizontal ops.
35880// NOTE: This may update the Ops and Mask.
35881static SDValue canonicalizeShuffleMaskWithHorizOp(
35882 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
35883 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
35884 const X86Subtarget &Subtarget) {
35885
35886 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
35887 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
35888 // represents the LHS/RHS inputs for the lower/upper halves.
35889 if (Mask.empty() || Ops.empty() || 2 < Ops.size())
35890 return SDValue();
35891
35892 SDValue BC0 = peekThroughBitcasts(Ops.front());
35893 SDValue BC1 = peekThroughBitcasts(Ops.back());
35894 EVT VT0 = BC0.getValueType();
35895 EVT VT1 = BC1.getValueType();
35896 unsigned Opcode0 = BC0.getOpcode();
35897 unsigned Opcode1 = BC1.getOpcode();
35898 if (Opcode0 != Opcode1 || VT0 != VT1 || VT0.getSizeInBits() != RootSizeInBits)
35899 return SDValue();
35900
35901 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
35902 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
35903 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
35904 if (!isHoriz && !isPack)
35905 return SDValue();
35906
35907 if (Mask.size() == VT0.getVectorNumElements()) {
35908 int NumElts = VT0.getVectorNumElements();
35909 int NumLanes = VT0.getSizeInBits() / 128;
35910 int NumEltsPerLane = NumElts / NumLanes;
35911 int NumHalfEltsPerLane = NumEltsPerLane / 2;
35912
35913 // Canonicalize binary shuffles of horizontal ops that use the
35914 // same sources to a unary shuffle.
35915 // TODO: Try to perform this fold even if the shuffle remains.
35916 if (Ops.size() == 2) {
35917 auto ContainsOps = [](SDValue HOp, SDValue Op) {
35918 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
35919 };
35920 // Commute if all BC0's ops are contained in BC1.
35921 if (ContainsOps(BC1, BC0.getOperand(0)) &&
35922 ContainsOps(BC1, BC0.getOperand(1))) {
35923 ShuffleVectorSDNode::commuteMask(Mask);
35924 std::swap(Ops[0], Ops[1]);
35925 std::swap(BC0, BC1);
35926 }
35927
35928 // If BC1 can be represented by BC0, then convert to unary shuffle.
35929 if (ContainsOps(BC0, BC1.getOperand(0)) &&
35930 ContainsOps(BC0, BC1.getOperand(1))) {
35931 for (int &M : Mask) {
35932 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
35933 continue;
35934 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
35935 M -= NumElts + (SubLane * NumHalfEltsPerLane);
35936 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
35937 M += NumHalfEltsPerLane;
35938 }
35939 }
35940 }
35941
35942 // Canonicalize unary horizontal ops to only refer to lower halves.
35943 for (int i = 0; i != NumElts; ++i) {
35944 int &M = Mask[i];
35945 if (isUndefOrZero(M))
35946 continue;
35947 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
35948 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
35949 M -= NumHalfEltsPerLane;
35950 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
35951 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
35952 M -= NumHalfEltsPerLane;
35953 }
35954 }
35955
35956 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
35957 SmallVector<int, 16> TargetMask128, WideMask128;
35958 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
35959 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
35960 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
35961 bool SingleOp = (Ops.size() == 1);
35962 if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
35963 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
35964 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
35965 Lo = Lo.getOperand(WideMask128[0] & 1);
35966 Hi = Hi.getOperand(WideMask128[1] & 1);
35967 if (SingleOp) {
35968 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
35969 SDValue Undef = DAG.getUNDEF(SrcVT);
35970 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
35971 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
35972 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
35973 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
35974 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
35975 }
35976 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
35977 }
35978 }
35979
35980 return SDValue();
35981}
35982
35983// Attempt to constant fold all of the constant source ops.
35984// Returns true if the entire shuffle is folded to a constant.
35985// TODO: Extend this to merge multiple constant Ops and update the mask.
35986static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
35987 ArrayRef<int> Mask, SDValue Root,
35988 bool HasVariableMask,
35989 SelectionDAG &DAG,
35990 const X86Subtarget &Subtarget) {
35991 MVT VT = Root.getSimpleValueType();
35992
35993 unsigned SizeInBits = VT.getSizeInBits();
35994 unsigned NumMaskElts = Mask.size();
35995 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
35996 unsigned NumOps = Ops.size();
35997
35998 // Extract constant bits from each source op.
35999 bool OneUseConstantOp = false;
36000 SmallVector<APInt, 16> UndefEltsOps(NumOps);
36001 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36002 for (unsigned i = 0; i != NumOps; ++i) {
36003 SDValue SrcOp = Ops[i];
36004 OneUseConstantOp |= SrcOp.hasOneUse();
36005 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36006 RawBitsOps[i]))
36007 return SDValue();
36008 }
36009
36010 // Only fold if at least one of the constants is only used once or
36011 // the combined shuffle has included a variable mask shuffle; this
36012 // is to avoid constant pool bloat.
36013 if (!OneUseConstantOp && !HasVariableMask)
36014 return SDValue();
36015
36016 // Shuffle the constant bits according to the mask.
36017 SDLoc DL(Root);
36018 APInt UndefElts(NumMaskElts, 0);
36019 APInt ZeroElts(NumMaskElts, 0);
36020 APInt ConstantElts(NumMaskElts, 0);
36021 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36022 APInt::getNullValue(MaskSizeInBits));
36023 for (unsigned i = 0; i != NumMaskElts; ++i) {
36024 int M = Mask[i];
36025 if (M == SM_SentinelUndef) {
36026 UndefElts.setBit(i);
36027 continue;
36028 } else if (M == SM_SentinelZero) {
36029 ZeroElts.setBit(i);
36030 continue;
36031 }
36032 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
36033
36034 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36035 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36036
36037 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36038 if (SrcUndefElts[SrcMaskIdx]) {
36039 UndefElts.setBit(i);
36040 continue;
36041 }
36042
36043 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36044 APInt &Bits = SrcEltBits[SrcMaskIdx];
36045 if (!Bits) {
36046 ZeroElts.setBit(i);
36047 continue;
36048 }
36049
36050 ConstantElts.setBit(i);
36051 ConstantBitData[i] = Bits;
36052 }
36053 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36054
36055 // Attempt to create a zero vector.
36056 if ((UndefElts | ZeroElts).isAllOnesValue())
36057 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36058
36059 // Create the constant data.
36060 MVT MaskSVT;
36061 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36062 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36063 else
36064 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36065
36066 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36067 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36068 return SDValue();
36069
36070 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36071 return DAG.getBitcast(VT, CstOp);
36072}
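
For readers following the index arithmetic in combineX86ShufflesConstants above, here is a minimal standalone sketch (illustrative only, not part of this file; the values are hypothetical) of how a combined mask entry M picks a source op and an element within it via division and modulo by NumMaskElts:

#include <cassert>
#include <cstdio>

int main() {
  // Hypothetical layout: 2 source ops of 4 elements each, so mask entries
  // range over [0, 8) and index the concatenation of both ops.
  const unsigned NumOps = 2, NumMaskElts = 4;
  const int Mask[NumMaskElts] = {0, 5, 2, 7};
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    assert(0 <= M && (unsigned)M < NumMaskElts * NumOps);
    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;   // which constant op supplies the lane
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; // which element of that op
    std::printf("lane %u <- op %u, element %u\n", i, SrcOpIdx, SrcMaskIdx);
  }
  return 0;
}
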
36073
36074namespace llvm {
36075 namespace X86 {
36076 enum {
36077 MaxShuffleCombineDepth = 8
36078 };
36079 }
36080} // namespace llvm
36081
36082/// Fully generic combining of x86 shuffle instructions.
36083///
36084/// This should be the last combine run over the x86 shuffle instructions. Once
36085/// they have been fully optimized, this will recursively consider all chains
36086/// of single-use shuffle instructions, build a generic model of the cumulative
36087/// shuffle operation, and check for simpler instructions which implement this
36088/// operation. We use this primarily for two purposes:
36089///
36090/// 1) Collapse generic shuffles to specialized single instructions when
36091/// equivalent. In most cases, this is just an encoding size win, but
36092/// sometimes we will collapse multiple generic shuffles into a single
36093/// special-purpose shuffle.
36094/// 2) Look for sequences of shuffle instructions with 3 or more total
36095/// instructions, and replace them with the slightly more expensive SSSE3
36096/// PSHUFB instruction if available. We do this as the last combining step
36097/// to ensure we avoid using PSHUFB if we can implement the shuffle with
36098/// a suitable short sequence of other instructions. The PSHUFB will either
36099/// use a register or have to read from memory and so is slightly (but only
36100/// slightly) more expensive than the other shuffle instructions.
36101///
36102/// Because this is inherently a quadratic operation (for each shuffle in
36103/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36104/// This should never be an issue in practice as the shuffle lowering doesn't
36105/// produce sequences of more than 8 instructions.
36106///
36107/// FIXME: We will currently miss some cases where the redundant shuffling
36108/// would simplify under the threshold for PSHUFB formation because of
36109/// combine-ordering. To fix this, we should do the redundant instruction
36110/// combining in this recursive walk.
36111static SDValue combineX86ShufflesRecursively(
36112 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36113 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36114 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
36115 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36116 assert(RootMask.size() > 0 &&
[1] Assuming the condition is true
[2] Assuming the condition is false
[3] Assuming the condition is true
[4] Assuming 'SrcOpIndex' is equal to 0
[5] '?' condition is true
36117 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36118 "Illegal shuffle root mask");
36119 assert(Root.getSimpleValueType().isVector() &&
[6] '?' condition is true
36120 "Shuffles operate on vector types!");
36121 unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36122
36123 // Bound the depth of our recursive combine because this is ultimately
36124 // quadratic in nature.
36125 if (Depth >= MaxDepth)
[7] Assuming 'Depth' is < 'MaxDepth'
[8] Taking false branch
36126 return SDValue();
36127
36128 // Directly rip through bitcasts to find the underlying operand.
36129 SDValue Op = SrcOps[SrcOpIndex];
36130 Op = peekThroughOneUseBitcasts(Op);
36131
36132 EVT VT = Op.getValueType();
36133 if (!VT.isVector() || !VT.isSimple())
[9] Calling 'EVT::isVector'
[17] Returning from 'EVT::isVector'
[18] Calling 'EVT::isSimple'
[20] Returning from 'EVT::isSimple'
[21] Taking false branch
36134 return SDValue(); // Bail if we hit a non-simple non-vector.
36135
36136 assert(VT.getSizeInBits() == RootSizeInBits &&
[22] Assuming the condition is true
[23] '?' condition is true
36137 "Can only combine shuffles of the same vector register size.");
36138
36139 // Extract target shuffle mask and resolve sentinels and inputs.
36140 // TODO - determine Op's demanded elts from RootMask.
36141 SmallVector<int, 64> OpMask;
36142 SmallVector<SDValue, 2> OpInputs;
36143 APInt OpUndef, OpZero;
36144 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36145 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36146 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
[24] Calling 'getTargetShuffleInputs'
[41] Returning from 'getTargetShuffleInputs'
[42] Taking false branch
36147 OpZero, DAG, Depth, false))
36148 return SDValue();
36149
36150 // Shuffle inputs must be the same size as the result, bail on any larger
36151 // inputs and widen any smaller inputs.
36152 if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
[43] Calling 'any_of<llvm::SmallVector<llvm::SDValue, 2> &, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
[51] Returning from 'any_of<llvm::SmallVector<llvm::SDValue, 2> &, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
[52] Taking false branch
36153 return Op.getValueSizeInBits() > RootSizeInBits;
36154 }))
36155 return SDValue();
36156
36157 for (SDValue &Op : OpInputs)
[53] Assuming '__begin1' is equal to '__end1'
36158 if (Op.getValueSizeInBits() < RootSizeInBits)
36159 Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
36160 SDLoc(Op), RootSizeInBits);
36161
36162 SmallVector<int, 64> Mask;
36163 SmallVector<SDValue, 16> Ops;
36164
36165 // We don't need to merge masks if the root is empty.
36166 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
[54] Assuming 'Depth' is not equal to 0
36167 if (EmptyRoot) {
[54.1] 'EmptyRoot' is false
[55] Taking false branch
36168 // Only resolve zeros if it will remove an input, otherwise we might end
36169 // up in an infinite loop.
36170 bool ResolveKnownZeros = true;
36171 if (!OpZero.isNullValue()) {
36172 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36173 for (int i = 0, e = OpMask.size(); i != e; ++i) {
36174 int M = OpMask[i];
36175 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36176 continue;
36177 UsedInputs.setBit(M / OpMask.size());
36178 if (UsedInputs.isAllOnesValue()) {
36179 ResolveKnownZeros = false;
36180 break;
36181 }
36182 }
36183 }
36184 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
36185 ResolveKnownZeros);
36186
36187 Mask = OpMask;
36188 Ops.append(OpInputs.begin(), OpInputs.end());
36189 } else {
36190 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
[56] Calling 'resolveTargetShuffleFromZeroables'
[63] Returning from 'resolveTargetShuffleFromZeroables'
36191
36192 // Add the inputs to the Ops list, avoiding duplicates.
36193 Ops.append(SrcOps.begin(), SrcOps.end());
36194
36195 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
36196 // Attempt to find an existing match.
36197 SDValue InputBC = peekThroughBitcasts(Input);
36198 for (int i = 0, e = Ops.size(); i < e; ++i)
36199 if (InputBC == peekThroughBitcasts(Ops[i]))
36200 return i;
36201 // Match failed - should we replace an existing Op?
36202 if (InsertionPoint >= 0) {
36203 Ops[InsertionPoint] = Input;
36204 return InsertionPoint;
36205 }
36206 // Add to the end of the Ops list.
36207 Ops.push_back(Input);
36208 return Ops.size() - 1;
36209 };
36210
36211 SmallVector<int, 2> OpInputIdx;
36212 for (SDValue OpInput : OpInputs)
[64] Assuming '__begin2' is equal to '__end2'
36213 OpInputIdx.push_back(
36214 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
36215
36216 assert(((RootMask.size() > OpMask.size() &&
[65] Assuming the condition is true
[66] Calling 'SmallVectorBase::size'
[68] Returning from 'SmallVectorBase::size'
[69] Division by zero
36217 RootMask.size() % OpMask.size() == 0) ||
36218 (OpMask.size() > RootMask.size() &&
36219 OpMask.size() % RootMask.size() == 0) ||
36220 OpMask.size() == RootMask.size()) &&
36221 "The smaller number of elements must divide the larger.");
36222
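
To make the analyzer's finding concrete: the path above assumes OpMask can be empty at this point, in which case RootMask.size() % OpMask.size() evaluates size() % 0, which is undefined behaviour (the reported division by zero). A small standalone sketch of a guarded form of the same size check, shown for illustration only and not as the LLVM fix:

#include <cstdio>
#include <vector>

// Same compatibility condition as the assertion above, but with an explicit
// guard so that size() % 0 is never evaluated.
static bool sizesCompatible(const std::vector<int> &RootMask,
                            const std::vector<int> &OpMask) {
  if (RootMask.empty() || OpMask.empty())
    return false;
  return (RootMask.size() > OpMask.size() &&
          RootMask.size() % OpMask.size() == 0) ||
         (OpMask.size() > RootMask.size() &&
          OpMask.size() % RootMask.size() == 0) ||
         OpMask.size() == RootMask.size();
}

int main() {
  std::vector<int> Root{0, 1, 2, 3}, Empty;
  std::printf("4 vs empty -> %s\n", sizesCompatible(Root, Empty) ? "compatible" : "rejected");
  return 0;
}
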
36223 // This function can be performance-critical, so we rely on the power-of-2
36224 // knowledge that we have about the mask sizes to replace div/rem ops with
36225 // bit-masks and shifts.
36226 assert(isPowerOf2_32(RootMask.size()) &&
36227 "Non-power-of-2 shuffle mask sizes");
36228 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
36229 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
36230 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
36231
36232 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
36233 unsigned RootRatio =
36234 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
36235 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
36236 assert((RootRatio == 1 || OpRatio == 1) &&
36237 "Must not have a ratio for both incoming and op masks!");
36238
36239 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
36240 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
36241 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
36242 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
36243 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
36244
36245 Mask.resize(MaskWidth, SM_SentinelUndef);
36246
36247 // Merge this shuffle operation's mask into our accumulated mask. Note that
36248 // this shuffle's mask will be the first applied to the input, followed by
36249 // the root mask to get us all the way to the root value arrangement. The
36250 // reason for this order is that we are recursing up the operation chain.
36251 for (unsigned i = 0; i < MaskWidth; ++i) {
36252 unsigned RootIdx = i >> RootRatioLog2;
36253 if (RootMask[RootIdx] < 0) {
36254 // This is a zero or undef lane, we're done.
36255 Mask[i] = RootMask[RootIdx];
36256 continue;
36257 }
36258
36259 unsigned RootMaskedIdx =
36260 RootRatio == 1
36261 ? RootMask[RootIdx]
36262 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
36263
36264 // Just insert the scaled root mask value if it references an input other
36265 // than the SrcOp we're currently inserting.
36266 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
36267 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
36268 Mask[i] = RootMaskedIdx;
36269 continue;
36270 }
36271
36272 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
36273 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
36274 if (OpMask[OpIdx] < 0) {
36275 // The incoming lanes are zero or undef, it doesn't matter which ones we
36276 // are using.
36277 Mask[i] = OpMask[OpIdx];
36278 continue;
36279 }
36280
36281 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
36282 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
36283 : (OpMask[OpIdx] << OpRatioLog2) +
36284 (RootMaskedIdx & (OpRatio - 1));
36285
36286 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
36287 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
36288 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
36289 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
36290
36291 Mask[i] = OpMaskedIdx;
36292 }
36293 }
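
As a worked illustration of the ratio arithmetic above (hypothetical sizes, not taken from the report): with RootMask.size() == 4 and OpMask.size() == 8, MaskWidth is 8, RootRatio is 2, OpRatio is 1, and each merged lane i reads root mask entry i >> RootRatioLog2. A standalone sketch using the compiler builtin __builtin_ctz in place of LLVM's countTrailingZeros:

#include <algorithm>
#include <cassert>
#include <cstdio>

int main() {
  unsigned RootSize = 4, OpSize = 8; // hypothetical power-of-2 mask sizes
  unsigned MaskWidth = std::max(RootSize, OpSize);                      // 8
  unsigned RootRatio = std::max(1u, OpSize >> __builtin_ctz(RootSize)); // 2
  unsigned OpRatio = std::max(1u, RootSize >> __builtin_ctz(OpSize));   // 1
  assert(RootRatio == 1 || OpRatio == 1);
  unsigned RootRatioLog2 = __builtin_ctz(RootRatio);
  for (unsigned i = 0; i != MaskWidth; ++i)
    std::printf("merged lane %u reads root mask entry %u\n", i, i >> RootRatioLog2);
  return 0;
}
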
36294
36295 // Remove unused/repeated shuffle source ops.
36296 resolveTargetShuffleInputsAndMask(Ops, Mask);
36297
36298 // Handle the all undef/zero cases early.
36299 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
36300 return DAG.getUNDEF(Root.getValueType());
36301 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
36302 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
36303 SDLoc(Root));
36304
36305 assert(!Ops.empty() && "Shuffle with no inputs detected");
36306 HasVariableMask |= IsOpVariableMask;
36307
36308 // Update the list of shuffle nodes that have been combined so far.
36309 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
36310 SrcNodes.end());
36311 CombinedNodes.push_back(Op.getNode());
36312
36313 // See if we can recurse into each shuffle source op (if it's a target
36314 // shuffle). The source op should only be generally combined if it either has
36315 // a single use (i.e. current Op) or all its users have already been combined;
36316 // if not, then we can still combine but should prevent generation of variable
36317 // shuffles to avoid constant pool bloat.
36318 // Don't recurse if we already have more source ops than we can combine in
36319 // the remaining recursion depth.
36320 if (Ops.size() < (MaxDepth - Depth)) {
36321 for (int i = 0, e = Ops.size(); i < e; ++i) {
36322 // For empty roots, we need to resolve zeroable elements before combining
36323 // them with other shuffles.
36324 SmallVector<int, 64> ResolvedMask = Mask;
36325 if (EmptyRoot)
36326 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
36327 bool AllowVar = false;
36328 if (Ops[i].getNode()->hasOneUse() ||
36329 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
36330 AllowVar = AllowVariableMask;
36331 if (SDValue Res = combineX86ShufflesRecursively(
36332 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
36333 HasVariableMask, AllowVar, DAG, Subtarget))
36334 return Res;
36335 }
36336 }
36337
36338 // Attempt to constant fold all of the constant source ops.
36339 if (SDValue Cst = combineX86ShufflesConstants(
36340 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
36341 return Cst;
36342
36343 // Canonicalize the combined shuffle mask chain with horizontal ops.
36344 // NOTE: This will update the Ops and Mask.
36345 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
36346 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
36347 return DAG.getBitcast(Root.getValueType(), HOp);
36348
36349 // We can only combine unary and binary shuffle mask cases.
36350 if (Ops.size() <= 2) {
36351 // Minor canonicalization of the accumulated shuffle mask to make it easier
36352 // to match below. All this does is detect masks with sequential pairs of
36353 // elements, and shrink them to the half-width mask. It does this in a loop
36354 // so it will reduce the size of the mask to the minimal width mask which
36355 // performs an equivalent shuffle.
36356 while (Mask.size() > 1) {
36357 SmallVector<int, 64> WidenedMask;
36358 if (!canWidenShuffleElements(Mask, WidenedMask))
36359 break;
36360 Mask = std::move(WidenedMask);
36361 }
36362
36363 // Canonicalization of binary shuffle masks to improve pattern matching by
36364 // commuting the inputs.
36365 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
36366 ShuffleVectorSDNode::commuteMask(Mask);
36367 std::swap(Ops[0], Ops[1]);
36368 }
36369
36370 // Finally, try to combine into a single shuffle instruction.
36371 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
36372 AllowVariableMask, DAG, Subtarget);
36373 }
36374
36375 // If that failed and any input is extracted then try to combine as a
36376 // shuffle with the larger type.
36377 return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
36378 HasVariableMask, AllowVariableMask,
36379 DAG, Subtarget);
36380}
36381
36382/// Helper entry wrapper to combineX86ShufflesRecursively.
36383static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
36384 const X86Subtarget &Subtarget) {
36385 return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
36386 X86::MaxShuffleCombineDepth,
36387 /*HasVarMask*/ false,
36388 /*AllowVarMask*/ true, DAG, Subtarget);
36389}
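
A minimal sketch (assumed shape, not code from this file) of the depth-bounding pattern this wrapper seeds: Depth starts at 0, each recursive call passes Depth + 1, and the recursion above bails out once Depth reaches X86::MaxShuffleCombineDepth (8):

#include <cstdio>

// Stand-in for the bounded recursion; the real function also merges masks
// and source ops at every level before recursing.
static void combineBounded(unsigned Depth, unsigned MaxDepth) {
  if (Depth >= MaxDepth)
    return; // mirrors the early 'return SDValue()' above
  std::printf("combining at depth %u of %u\n", Depth, MaxDepth);
  combineBounded(Depth + 1, MaxDepth);
}

int main() {
  combineBounded(/*Depth=*/0, /*MaxDepth=*/8);
  return 0;
}
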
36390
36391/// Get the PSHUF-style mask from PSHUF node.
36392///
36393/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
36394/// PSHUF-style masks that can be reused with such instructions.
36395static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
36396 MVT VT = N.getSimpleValueType();
36397 SmallVector<int, 4> Mask;
36398 SmallVector<SDValue, 2> Ops;
36399 bool IsUnary;
36400 bool HaveMask =
36401 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
36402 (void)HaveMask;
36403 assert(HaveMask);
36404
36405 // If we have more than 128-bits, only the low 128-bits of shuffle mask
36406 // matter. Check that the upper masks are repeats and remove them.
36407 if (VT.getSizeInBits() > 128) {
36408 int LaneElts = 128 / VT.getScalarSizeInBits();
36409#ifndef NDEBUG
36410 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
36411 for (int j = 0; j < LaneElts; ++j)
36412 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
36413 "Mask doesn't repeat in high 128-bit lanes!");
36414#endif
36415 Mask.resize(LaneElts);
36416 }
36417
36418 switch (N.getOpcode()) {
36419 case X86ISD::PSHUFD:
36420 return Mask;
36421 case X86ISD::PSHUFLW:
36422 Mask.resize(4);
36423 return Mask;
36424 case X86ISD::PSHUFHW:
36425 Mask.erase(Mask.begin(), Mask.begin() + 4);
36426 for (int &M : Mask)
36427 M -= 4;
36428 return Mask;
36429 default:
36430 llvm_unreachable("No valid shuffle instruction found!");
36431 }
36432}
36433
36434/// Search for a combinable shuffle across a chain ending in pshufd.
36435///
36436/// We walk up the chain and look for a combinable shuffle, skipping over
36437/// shuffles that we could hoist this shuffle's transformation past without
36438/// altering anything.
36439static SDValue
36440combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
36441 SelectionDAG &DAG) {
36442 assert(N.getOpcode() == X86ISD::PSHUFD &&
36443 "Called with something other than an x86 128-bit half shuffle!");
36444 SDLoc DL(N);
36445
36446 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
36447 // of the shuffles in the chain so that we can form a fresh chain to replace
36448 // this one.
36449 SmallVector<SDValue, 8> Chain;
36450 SDValue V = N.getOperand(0);
36451 for (; V.hasOneUse(); V = V.getOperand(0)) {
36452 switch (V.getOpcode()) {
36453 default:
36454 return SDValue(); // Nothing combined!
36455
36456 case ISD::BITCAST:
36457 // Skip bitcasts as we always know the type for the target specific
36458 // instructions.
36459 continue;
36460
36461 case X86ISD::PSHUFD:
36462 // Found another dword shuffle.
36463 break;
36464
36465 case X86ISD::PSHUFLW:
36466 // Check that the low words (being shuffled) are the identity in the
36467 // dword shuffle, and the high words are self-contained.
36468 if (Mask[0] != 0 || Mask[1] != 1 ||
36469 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
36470 return SDValue();
36471
36472 Chain.push_back(V);
36473 continue;
36474
36475 case X86ISD::PSHUFHW:
36476 // Check that the high words (being shuffled) are the identity in the
36477 // dword shuffle, and the low words are self-contained.
36478 if (Mask[2] != 2 || Mask[3] != 3 ||
36479 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
36480 return SDValue();
36481
36482 Chain.push_back(V);
36483 continue;
36484
36485 case X86ISD::UNPCKL:
36486 case X86ISD::UNPCKH:
36487 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
36488 // shuffle into a preceding word shuffle.
36489 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
36490 V.getSimpleValueType().getVectorElementType() != MVT::i16)
36491 return SDValue();
36492
36493 // Search for a half-shuffle which we can combine with.
36494 unsigned CombineOp =
36495 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
36496 if (V.getOperand(0) != V.getOperand(1) ||
36497 !V->isOnlyUserOf(V.getOperand(0).getNode()))
36498 return SDValue();
36499 Chain.push_back(V);
36500 V = V.getOperand(0);
36501 do {
36502 switch (V.getOpcode()) {
36503 default:
36504 return SDValue(); // Nothing to combine.
36505
36506 case X86ISD::PSHUFLW:
36507 case X86ISD::PSHUFHW:
36508 if (V.getOpcode() == CombineOp)
36509 break;
36510
36511 Chain.push_back(V);
36512
36513 LLVM_FALLTHROUGH;
36514 case ISD::BITCAST:
36515 V = V.getOperand(0);
36516 continue;
36517 }
36518 break;
36519 } while (V.hasOneUse());
36520 break;
36521 }
36522 // Break out of the loop if we break out of the switch.
36523 break;
36524 }
36525
36526 if (!V.hasOneUse())
36527 // We fell out of the loop without finding a viable combining instruction.
36528 return SDValue();
36529
36530 // Merge this node's mask and our incoming mask.
36531 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
36532 for (int &M : Mask)
36533 M = VMask[M];
36534 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
36535 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
36536
36537 // Rebuild the chain around this new shuffle.
36538 while (!Chain.empty()) {
36539 SDValue W = Chain.pop_back_val();
36540
36541 if (V.getValueType() != W.getOperand(0).getValueType())
36542 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
36543
36544 switch (W.getOpcode()) {
36545 default:
36546 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
36547
36548 case X86ISD::UNPCKL:
36549 case X86ISD::UNPCKH:
36550 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
36551 break;
36552
36553 case X86ISD::PSHUFD:
36554 case X86ISD::PSHUFLW:
36555 case X86ISD::PSHUFHW:
36556 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
36557 break;
36558 }
36559 }
36560 if (V.getValueType() != N.getValueType())
36561 V = DAG.getBitcast(N.getValueType(), V);
36562
36563 // Return the new chain to replace N.
36564 return V;
36565}
36566
36567// Attempt to commute shufps LHS loads:
36568// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
36569static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
36570 SelectionDAG &DAG) {
36571 // TODO: Add vXf64 support.
36572 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
36573 return SDValue();
36574
36575 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
36576 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
36577 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
36578 return SDValue();
36579 SDValue N0 = V.getOperand(0);
36580 SDValue N1 = V.getOperand(1);
36581 unsigned Imm = V.getConstantOperandVal(2);
36582 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
36583 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
36584 return SDValue();
36585 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
36586 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
36587 DAG.getTargetConstant(Imm, DL, MVT::i8));
36588 };
36589
36590 switch (N.getOpcode()) {
36591 case X86ISD::VPERMILPI:
36592 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
36593 unsigned Imm = N.getConstantOperandVal(1);
36594 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
36595 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
36596 }
36597 break;
36598 case X86ISD::SHUFP: {
36599 SDValue N0 = N.getOperand(0);
36600 SDValue N1 = N.getOperand(1);
36601 unsigned Imm = N.getConstantOperandVal(2);
36602 if (N0 == N1) {
36603 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
36604 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
36605 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
36606 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
36607 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
36608 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
36609 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
36610 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
36611 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
36612 }
36613 break;
36614 }
36615 }
36616
36617 return SDValue();
36618}
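
As a small standalone illustration (hypothetical immediate) of the nibble swap performed in commuteSHUFP above: SHUFP's 8-bit immediate selects two elements from the first operand with its low nibble and two from the second operand with its high nibble, so commuting the operands swaps the nibbles:

#include <cassert>
#include <cstdio>

int main() {
  unsigned Imm = 0xB4; // hypothetical SHUFP selector
  unsigned Swapped = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
  std::printf("0x%02X -> 0x%02X\n", Imm, Swapped);
  assert(Swapped == 0x4B);
  return 0;
}
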
36619
36620/// Try to combine x86 target specific shuffles.
36621static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
36622 TargetLowering::DAGCombinerInfo &DCI,
36623 const X86Subtarget &Subtarget) {
36624 SDLoc DL(N);
36625 MVT VT = N.getSimpleValueType();
36626 SmallVector<int, 4> Mask;
36627 unsigned Opcode = N.getOpcode();
36628
36629 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
36630 return R;
36631
36632 // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
36633 // help expose the 'NOT' pattern further up the DAG.
36634 // TODO: This might be beneficial for any binop with a 'splattable' operand.
36635 switch (Opcode) {
36636 case X86ISD::MOVDDUP:
36637 case X86ISD::PSHUFD: {
36638 SDValue Src = N.getOperand(0);
36639 if (Src.hasOneUse() && Src.getValueType() == VT) {
36640 if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
36641 Not = DAG.getBitcast(VT, Not);
36642 Not = Opcode == X86ISD::MOVDDUP
36643 ? DAG.getNode(Opcode, DL, VT, Not)
36644 : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
36645 EVT IntVT = Not.getValueType().changeTypeToInteger();
36646 SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
36647 Not = DAG.getBitcast(IntVT, Not);
36648 Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
36649 return DAG.getBitcast(VT, Not);
36650 }
36651 }
36652 break;
36653 }
36654 }
36655
36656 // Handle specific target shuffles.
36657 switch (Opcode) {
36658 case X86ISD::MOVDDUP: {
36659 SDValue Src = N.getOperand(0);
36660 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
36661 if (VT == MVT::v2f64 && Src.hasOneUse() &&
36662 ISD::isNormalLoad(Src.getNode())) {
36663 LoadSDNode *LN = cast<LoadSDNode>(Src);
36664 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
36665 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
36666 DCI.CombineTo(N.getNode(), Movddup);
36667 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
36668 DCI.recursivelyDeleteUnusedNodes(LN);
36669 return N; // Return N so it doesn't get rechecked!
36670 }
36671 }
36672
36673 return SDValue();
36674 }
36675 case X86ISD::VBROADCAST: {
36676 SDValue Src = N.getOperand(0);
36677 SDValue BC = peekThroughBitcasts(Src);
36678 EVT SrcVT = Src.getValueType();
36679 EVT BCVT = BC.getValueType();
36680
36681 // If broadcasting from another shuffle, attempt to simplify it.
36682 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
36683 if (isTargetShuffle(BC.getOpcode()) &&
36684 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
36685 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
36686 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
36687 SM_SentinelUndef);
36688 for (unsigned i = 0; i != Scale; ++i)
36689 DemandedMask[i] = i;
36690 if (SDValue Res = combineX86ShufflesRecursively(
36691 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
36692 X86::MaxShuffleCombineDepth,
36693 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
36694 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
36695 DAG.getBitcast(SrcVT, Res));
36696 }
36697
36698 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
36699 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
36700 if (Src.getOpcode() == ISD::BITCAST &&
36701 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
36702 DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
36703 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
36704 VT.getVectorNumElements());
36705 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
36706 }
36707
36708 // Reduce broadcast source vector to lowest 128-bits.
36709 if (SrcVT.getSizeInBits() > 128)
36710 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
36711 extract128BitVector(Src, 0, DAG, DL));
36712
36713 // broadcast(scalar_to_vector(x)) -> broadcast(x).
36714 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
36715 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
36716
36717 // Share broadcast with the longest vector and extract low subvector (free).
36718 // Ensure the same SDValue from the SDNode use is being used.
36719 for (SDNode *User : Src->uses())
36720 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
36721 Src == User->getOperand(0) &&
36722 User->getValueSizeInBits(0).getFixedSize() >
36723 VT.getFixedSizeInBits()) {
36724 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
36725 VT.getSizeInBits());
36726 }
36727
36728 // vbroadcast(scalarload X) -> vbroadcast_load X
36729 // For float loads, extract other uses of the scalar from the broadcast.
36730 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
36731 ISD::isNormalLoad(Src.getNode())) {
36732 LoadSDNode *LN = cast<LoadSDNode>(Src);
36733 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36734 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
36735 SDValue BcastLd =
36736 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
36737 LN->getMemoryVT(), LN->getMemOperand());
36738 // If the load value is used only by N, replace it via CombineTo N.
36739 bool NoReplaceExtract = Src.hasOneUse();
36740 DCI.CombineTo(N.getNode(), BcastLd);
36741 if (NoReplaceExtract) {
36742 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36743 DCI.recursivelyDeleteUnusedNodes(LN);
36744 } else {
36745 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
36746 DAG.getIntPtrConstant(0, DL));
36747 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
36748 }
36749 return N; // Return N so it doesn't get rechecked!
36750 }
36751
36752 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
36753 // i16. So shrink it ourselves if we can make a broadcast_load.
36754 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
36755 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
36756 assert(Subtarget.hasAVX2() && "Expected AVX2");
36757 SDValue TruncIn = Src.getOperand(0);
36758
36759 // If this is a truncate of a non extending load we can just narrow it to
36760 // use a broadcast_load.
36761 if (ISD::isNormalLoad(TruncIn.getNode())) {
36762 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
36763 // Unless it's volatile or atomic.
36764 if (LN->isSimple()) {
36765 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36766 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
36767 SDValue BcastLd = DAG.getMemIntrinsicNode(
36768 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
36769 LN->getPointerInfo(), LN->getOriginalAlign(),
36770 LN->getMemOperand()->getFlags());
36771 DCI.CombineTo(N.getNode(), BcastLd);
36772 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36773 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
36774 return N; // Return N so it doesn't get rechecked!
36775 }
36776 }
36777
36778 // If this is a truncate of an i16 extload, we can directly replace it.
36779 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
36780 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
36781 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
36782 if (LN->getMemoryVT().getSizeInBits() == 16) {
36783 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36784 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
36785 SDValue BcastLd =
36786 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
36787 LN->getMemoryVT(), LN->getMemOperand());
36788 DCI.CombineTo(N.getNode(), BcastLd);
36789 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36790 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
36791 return N; // Return N so it doesn't get rechecked!
36792 }
36793 }
36794
36795 // If this is a truncate of a load that has been shifted right, we can
36796 // offset the pointer and use a narrower load.
36797 if (TruncIn.getOpcode() == ISD::SRL &&
36798 TruncIn.getOperand(0).hasOneUse() &&
36799 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
36800 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
36801 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
36802 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
36803 // Make sure the shift amount and the load size are divisible by 16.
36804 // Don't do this if the load is volatile or atomic.
36805 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
36806 LN->isSimple()) {
36807 unsigned Offset = ShiftAmt / 8;
36808 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36809 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
36810 TypeSize::Fixed(Offset), DL);
36811 SDValue Ops[] = { LN->getChain(), Ptr };
36812 SDValue BcastLd = DAG.getMemIntrinsicNode(
36813 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
36814 LN->getPointerInfo().getWithOffset(Offset),
36815 LN->getOriginalAlign(),
36816 LN->getMemOperand()->getFlags());
36817 DCI.CombineTo(N.getNode(), BcastLd);
36818 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36819 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
36820 return N; // Return N so it doesn't get rechecked!
36821 }
36822 }
36823 }
36824
36825 // vbroadcast(vzload X) -> vbroadcast_load X
36826 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
36827 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
36828 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
36829 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36830 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
36831 SDValue BcastLd =
36832 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
36833 LN->getMemoryVT(), LN->getMemOperand());
36834 DCI.CombineTo(N.getNode(), BcastLd);
36835 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36836 DCI.recursivelyDeleteUnusedNodes(LN);
36837 return N; // Return N so it doesn't get rechecked!
36838 }
36839 }
36840
36841 // vbroadcast(vector load X) -> vbroadcast_load
36842 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
36843 SrcVT == MVT::v4i32) &&
36844 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
36845 LoadSDNode *LN = cast<LoadSDNode>(Src);
36846 // Unless the load is volatile or atomic.
36847 if (LN->isSimple()) {
36848 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36849 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36850 SDValue BcastLd = DAG.getMemIntrinsicNode(
36851 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
36852 LN->getPointerInfo(), LN->getOriginalAlign(),
36853 LN->getMemOperand()->getFlags());
36854 DCI.CombineTo(N.getNode(), BcastLd);
36855 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
36856 DCI.recursivelyDeleteUnusedNodes(LN);
36857 return N; // Return N so it doesn't get rechecked!
36858 }
36859 }
36860
36861 return SDValue();
36862 }
36863 case X86ISD::VZEXT_MOVL: {
36864 SDValue N0 = N.getOperand(0);
36865
36866 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
36867 // the load is volatile.
36868 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
36869 auto *LN = cast<LoadSDNode>(N0);
36870 if (SDValue VZLoad =
36871 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
36872 DCI.CombineTo(N.getNode(), VZLoad);
36873 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
36874 DCI.recursivelyDeleteUnusedNodes(LN);
36875 return N;
36876 }
36877 }
36878
36879 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
36880 // and can just use a VZEXT_LOAD.
36881 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
36882 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
36883 auto *LN = cast<MemSDNode>(N0);
36884 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
36885 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36886 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36887 SDValue VZLoad =
36888 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
36889 LN->getMemoryVT(), LN->getMemOperand());
36890 DCI.CombineTo(N.getNode(), VZLoad);
36891 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
36892 DCI.recursivelyDeleteUnusedNodes(LN);
36893 return N;
36894 }
36895 }
36896
36897 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
36898 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
36899 // if the upper bits of the i64 are zero.
36900 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36901 N0.getOperand(0).hasOneUse() &&
36902 N0.getOperand(0).getValueType() == MVT::i64) {
36903 SDValue In = N0.getOperand(0);
36904 APInt Mask = APInt::getHighBitsSet(64, 32);
36905 if (DAG.MaskedValueIsZero(In, Mask)) {
36906 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
36907 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
36908 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
36909 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
36910 return DAG.getBitcast(VT, Movl);
36911 }
36912 }
36913
36914 // Load a scalar integer constant directly to XMM instead of transferring an
36915 // immediate value from GPR.
36916 // vzext_movl (scalar_to_vector C) --> load [C,0...]
36917 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36918 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
36919 // Create a vector constant - scalar constant followed by zeros.
36920 EVT ScalarVT = N0.getOperand(0).getValueType();
36921 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
36922 unsigned NumElts = VT.getVectorNumElements();
36923 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
36924 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
36925 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
36926
36927 // Load the vector constant from constant pool.
36928 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
36929 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
36930 MachinePointerInfo MPI =
36931 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
36932 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
36933 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
36934 MachineMemOperand::MOLoad);
36935 }
36936 }
36937
36938 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
36939 // insert into a zero vector. This helps get VZEXT_MOVL closer to
36940 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
36941 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
36942 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
36943 SDValue V = peekThroughOneUseBitcasts(N0);
36944
36945 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
36946 isNullConstant(V.getOperand(2))) {
36947 SDValue In = V.getOperand(1);
36948 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
36949 In.getValueSizeInBits() /
36950 VT.getScalarSizeInBits());
36951 In = DAG.getBitcast(SubVT, In);
36952 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
36953 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
36954 getZeroVector(VT, Subtarget, DAG, DL), Movl,
36955 V.getOperand(2));
36956 }
36957 }
36958
36959 return SDValue();
36960 }
36961 case X86ISD::BLENDI: {
36962 SDValue N0 = N.getOperand(0);
36963 SDValue N1 = N.getOperand(1);
36964
36965 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
36966 // TODO: Handle MVT::v16i16 repeated blend mask.
36967 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
36968 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
36969 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
36970 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
36971 SrcVT.getScalarSizeInBits() >= 32) {
36972 unsigned BlendMask = N.getConstantOperandVal(2);
36973 unsigned Size = VT.getVectorNumElements();
36974 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
36975 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
36976 return DAG.getBitcast(
36977 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
36978 N1.getOperand(0),
36979 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
36980 }
36981 }
36982 return SDValue();
36983 }
36984 case X86ISD::VPERMI: {
36985 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
36986 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
36987 SDValue N0 = N.getOperand(0);
36988 SDValue N1 = N.getOperand(1);
36989 unsigned EltSizeInBits = VT.getScalarSizeInBits();
36990 if (N0.getOpcode() == ISD::BITCAST &&
36991 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
36992 SDValue Src = N0.getOperand(0);
36993 EVT SrcVT = Src.getValueType();
36994 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
36995 return DAG.getBitcast(VT, Res);
36996 }
36997 return SDValue();
36998 }
36999 case X86ISD::VPERM2X128: {
37000 // If both 128-bit values were inserted into high halves of 256-bit values,
37001 // the shuffle can be reduced to a concatenation of subvectors:
37002 // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
37003 // Note: We are only looking for the exact high/high shuffle mask because we
37004 // expect to fold other similar patterns before creating this opcode.
37005 SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
37006 SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
37007 unsigned Imm = N.getConstantOperandVal(2);
37008 if (!(Imm == 0x31 &&
37009 Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
37010 Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
37011 Ins0.getValueType() == Ins1.getValueType()))
37012 return SDValue();
37013
37014 SDValue X = Ins0.getOperand(1);
37015 SDValue Y = Ins1.getOperand(1);
37016 unsigned C1 = Ins0.getConstantOperandVal(2);
37017 unsigned C2 = Ins1.getConstantOperandVal(2);
37018 MVT SrcVT = X.getSimpleValueType();
37019 unsigned SrcElts = SrcVT.getVectorNumElements();
37020 if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
37021 C1 != SrcElts || C2 != SrcElts)
37022 return SDValue();
37023
37024 return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
37025 Ins1.getValueType(), X, Y));
37026 }
37027 case X86ISD::PSHUFD:
37028 case X86ISD::PSHUFLW:
37029 case X86ISD::PSHUFHW:
37030 Mask = getPSHUFShuffleMask(N);
37031 assert(Mask.size() == 4);
37032 break;
37033 case X86ISD::MOVSD:
37034 case X86ISD::MOVSS: {
37035 SDValue N0 = N.getOperand(0);
37036 SDValue N1 = N.getOperand(1);
37037
37038 // Canonicalize scalar FPOps:
37039 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
37040 // If commutable, allow OP(N1[0], N0[0]).
37041 unsigned Opcode1 = N1.getOpcode();
37042 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
37043 Opcode1 == ISD::FDIV) {
37044 SDValue N10 = N1.getOperand(0);
37045 SDValue N11 = N1.getOperand(1);
37046 if (N10 == N0 ||
37047 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
37048 if (N10 != N0)
37049 std::swap(N10, N11);
37050 MVT SVT = VT.getVectorElementType();
37051 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
37052 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
37053 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
37054 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
37055 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37056 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
37057 }
37058 }
37059
37060 return SDValue();
37061 }
37062 case X86ISD::INSERTPS: {
37063 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
37064 SDValue Op0 = N.getOperand(0);
37065 SDValue Op1 = N.getOperand(1);
37066 unsigned InsertPSMask = N.getConstantOperandVal(2);
37067 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
37068 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
37069 unsigned ZeroMask = InsertPSMask & 0xF;
37070
37071 // If we zero out all elements from Op0 then we don't need to reference it.
37072 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
37073 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
37074 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37075
37076 // If we zero out the element from Op1 then we don't need to reference it.
37077 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
37078 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37079 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37080
37081 // Attempt to merge insertps Op1 with an inner target shuffle node.
37082 SmallVector<int, 8> TargetMask1;
37083 SmallVector<SDValue, 2> Ops1;
37084 APInt KnownUndef1, KnownZero1;
37085 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
37086 KnownZero1)) {
37087 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
37088 // Zero/UNDEF insertion - zero out element and remove dependency.
37089 InsertPSMask |= (1u << DstIdx);
37090 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
37091 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37092 }
37093 // Update insertps mask srcidx and reference the source input directly.
37094 int M = TargetMask1[SrcIdx];
37095 assert(0 <= M && M < 8 && "Shuffle index out of range");
37096 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
37097 Op1 = Ops1[M < 4 ? 0 : 1];
37098 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37099 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37100 }
37101
37102 // Attempt to merge insertps Op0 with an inner target shuffle node.
37103 SmallVector<int, 8> TargetMask0;
37104 SmallVector<SDValue, 2> Ops0;
37105 APInt KnownUndef0, KnownZero0;
37106 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
37107 KnownZero0)) {
37108 bool Updated = false;
37109 bool UseInput00 = false;
37110 bool UseInput01 = false;
37111 for (int i = 0; i != 4; ++i) {
37112 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
37113 // No change if element is already zero or the inserted element.
37114 continue;
37115 } else if (KnownUndef0[i] || KnownZero0[i]) {
37116 // If the target mask is undef/zero then we must zero the element.
37117 InsertPSMask |= (1u << i);
37118 Updated = true;
37119 continue;
37120 }
37121
37122 // The input vector element must be in-line (kept at its original index).
37123 int M = TargetMask0[i];
37124 if (M != i && M != (i + 4))
37125 return SDValue();
37126
37127 // Determine which inputs of the target shuffle we're using.
37128 UseInput00 |= (0 <= M && M < 4);
37129 UseInput01 |= (4 <= M);
37130 }
37131
37132 // If we're not using both inputs of the target shuffle then use the
37133 // referenced input directly.
37134 if (UseInput00 && !UseInput01) {
37135 Updated = true;
37136 Op0 = Ops0[0];
37137 } else if (!UseInput00 && UseInput01) {
37138 Updated = true;
37139 Op0 = Ops0[1];
37140 }
37141
37142 if (Updated)
37143 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
37144 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
37145 }
37146
37147 // If we're inserting an element from a vbroadcast load, fold the
37148 // load into the X86insertps instruction. We need to convert the scalar
37149 // load to a vector and clear the source lane of the INSERTPS control.
37150 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
37151 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
37152 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
37153 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
37154 MemIntr->getBasePtr(),
37155 MemIntr->getMemOperand());
37156 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
37157 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
37158 Load),
37159 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
37160 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
37161 return Insert;
37162 }
37163 }
37164
37165 return SDValue();
37166 }
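All of the INSERTPS rewrites above work by editing the 8-bit immediate. A minimal standalone sketch of that immediate's layout, assuming the SSE4.1 encoding the code relies on (bits [7:6] source element, bits [5:4] destination element, bits [3:0] zero mask):

#include <cstdio>

struct InsertPSImm {
  unsigned SrcIdx;   // bits [7:6]: element read from the second operand
  unsigned DstIdx;   // bits [5:4]: element of the result that is written
  unsigned ZeroMask; // bits [3:0]: result elements forced to zero
};

static InsertPSImm decode(unsigned Imm) {
  return {(Imm >> 6) & 0x3, (Imm >> 4) & 0x3, Imm & 0xF};
}

static unsigned encode(const InsertPSImm &F) {
  return (F.SrcIdx << 6) | (F.DstIdx << 4) | (F.ZeroMask & 0xF);
}

int main() {
  // Hypothetical immediate: take element 2 of Op1, write it to element 1,
  // and zero element 3 of the result.
  unsigned Imm = encode({2, 1, 0x8});
  InsertPSImm F = decode(Imm);
  // The "zero/undef insertion" rewrite above just ORs (1 << DstIdx) into the
  // zero mask, so the inserted lane becomes zero and Op1 is no longer needed.
  unsigned Zeroed = Imm | (1u << F.DstIdx);
  std::printf("imm=0x%02x zeroed=0x%02x\n", Imm, Zeroed); // 0x98 and 0x9a
  return 0;
}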
37167 default:
37168 return SDValue();
37169 }
37170
37171 // Nuke no-op shuffles that show up after combining.
37172 if (isNoopShuffleMask(Mask))
37173 return N.getOperand(0);
37174
37175 // Look for simplifications involving one or two shuffle instructions.
37176 SDValue V = N.getOperand(0);
37177 switch (N.getOpcode()) {
37178 default:
37179 break;
37180 case X86ISD::PSHUFLW:
37181 case X86ISD::PSHUFHW:
37182 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
37183
37184 // See if this reduces to a PSHUFD which is no more expensive and can
37185 // combine with more operations. Note that it has to at least flip the
37186 // dwords as otherwise it would have been removed as a no-op.
37187 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
37188 int DMask[] = {0, 1, 2, 3};
37189 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
37190 DMask[DOffset + 0] = DOffset + 1;
37191 DMask[DOffset + 1] = DOffset + 0;
37192 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
37193 V = DAG.getBitcast(DVT, V);
37194 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
37195 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
37196 return DAG.getBitcast(VT, V);
37197 }
37198
37199 // Look for shuffle patterns which can be implemented as a single unpack.
37200 // FIXME: This doesn't handle the location of the PSHUFD generically, and
37201 // only works when we have a PSHUFD followed by two half-shuffles.
37202 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
37203 (V.getOpcode() == X86ISD::PSHUFLW ||
37204 V.getOpcode() == X86ISD::PSHUFHW) &&
37205 V.getOpcode() != N.getOpcode() &&
37206 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
37207 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
37208 if (D.getOpcode() == X86ISD::PSHUFD) {
37209 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37210 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
37211 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
37212 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
37213 int WordMask[8];
37214 for (int i = 0; i < 4; ++i) {
37215 WordMask[i + NOffset] = Mask[i] + NOffset;
37216 WordMask[i + VOffset] = VMask[i] + VOffset;
37217 }
37218 // Map the word mask through the DWord mask.
37219 int MappedMask[8];
37220 for (int i = 0; i < 8; ++i)
37221 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
37222 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
37223 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
37224 // We can replace all three shuffles with an unpack.
37225 V = DAG.getBitcast(VT, D.getOperand(0));
37226 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
37227 : X86ISD::UNPCKH,
37228 DL, VT, V, V);
37229 }
37230 }
37231 }
37232
37233 break;
37234
37235 case X86ISD::PSHUFD:
37236 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
37237 return NewN;
37238
37239 break;
37240 }
37241
37242 return SDValue();
37243}
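The PSHUFLW/PSHUFHW branch above composes a word-level shuffle with an inner PSHUFD by mapping every word index through the dword mask, MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2, and then checks whether the composition is an unpack. A small standalone check of that composition arithmetic, using hypothetical masks purely for illustration:

#include <array>
#include <cstdio>

// Compose an 8-entry word shuffle applied after a 4-entry dword shuffle:
// word i of the final result comes from word Mapped[i] of the original input.
static std::array<int, 8> mapThroughDWords(const std::array<int, 8> &WordMask,
                                           const std::array<int, 4> &DMask) {
  std::array<int, 8> Mapped{};
  for (int i = 0; i < 8; ++i)
    Mapped[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
  return Mapped;
}

int main() {
  // Hypothetical inner PSHUFD mask and outer word mask.
  std::array<int, 4> DMask = {2, 3, 0, 1};
  std::array<int, 8> WordMask = {0, 1, 2, 3, 4, 4, 5, 5};
  for (int V : mapThroughDWords(WordMask, DMask))
    std::printf("%d ", V); // prints: 4 5 6 7 0 0 1 1
  std::printf("\n");
  return 0;
}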
37244
37245/// Checks if the shuffle mask takes subsequent elements
37246/// alternately from two vectors.
37247/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
37248static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
37249
37250 int ParitySrc[2] = {-1, -1};
37251 unsigned Size = Mask.size();
37252 for (unsigned i = 0; i != Size; ++i) {
37253 int M = Mask[i];
37254 if (M < 0)
37255 continue;
37256
37257 // Make sure we are using the matching element from the input.
37258 if ((M % Size) != i)
37259 return false;
37260
37261 // Make sure we use the same input for all elements of the same parity.
37262 int Src = M / Size;
37263 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
37264 return false;
37265 ParitySrc[i % 2] = Src;
37266 }
37267
37268 // Make sure each input is used.
37269 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
37270 return false;
37271
37272 Op0Even = ParitySrc[0] == 0;
37273 return true;
37274}
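A standalone re-implementation sketch of the parity check above on plain std::vector (no SelectionDAG types), handy for seeing why <0, 5, 2, 7> is accepted while an identity mask such as <0, 1, 2, 3> is not:

#include <cstdio>
#include <vector>

// Even lanes must all come from one source and odd lanes from the other, and
// every defined lane must take its own element (Mask[i] % Size == i).
static bool isAddSubOrSubAddMask(const std::vector<int> &Mask, bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = (int)Mask.size();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef lane imposes no constraint
    if ((M % Size) != i)
      return false;             // must use the matching element
    int Src = M / Size;         // 0 = first input, 1 = second input
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;             // same parity must keep using the same input
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;               // both inputs must actually be used
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even = false;
  std::printf("%d\n", isAddSubOrSubAddMask({0, 5, 2, 7}, Op0Even)); // 1
  std::printf("%d\n", isAddSubOrSubAddMask({0, 1, 2, 3}, Op0Even)); // 0
  return 0;
}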
37275
37276/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
37277/// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
37278/// are written to the parameters \p Opnd0 and \p Opnd1.
37279///
37280/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
37281/// so it is easier to generically match. We also insert dummy vector shuffle
37282/// nodes for the operands which explicitly discard the lanes which are unused
37283/// by this operation to try to flow through the rest of the combiner
37284/// the fact that they're unused.
37285static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
37286 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
37287 bool &IsSubAdd) {
37288
37289 EVT VT = N->getValueType(0);
37290 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37291 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
37292 !VT.getSimpleVT().isFloatingPoint())
37293 return false;
37294
37295 // We only handle target-independent shuffles.
37296 // FIXME: It would be easy and harmless to use the target shuffle mask
37297 // extraction tool to support more.
37298 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
37299 return false;
37300
37301 SDValue V1 = N->getOperand(0);
37302 SDValue V2 = N->getOperand(1);
37303
37304 // Make sure we have an FADD and an FSUB.
37305 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
37306 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
37307 V1.getOpcode() == V2.getOpcode())
37308 return false;
37309
37310 // If there are other uses of these operations we can't fold them.
37311 if (!V1->hasOneUse() || !V2->hasOneUse())
37312 return false;
37313
37314 // Ensure that both operations have the same operands. Note that we can
37315 // commute the FADD operands.
37316 SDValue LHS, RHS;
37317 if (V1.getOpcode() == ISD::FSUB) {
37318 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
37319 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
37320 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
37321 return false;
37322 } else {
37323 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
37324 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
37325 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
37326 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
37327 return false;
37328 }
37329
37330 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
37331 bool Op0Even;
37332 if (!isAddSubOrSubAddMask(Mask, Op0Even))
37333 return false;
37334
37335 // It's a subadd if the vector in the even parity is an FADD.
37336 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
37337 : V2->getOpcode() == ISD::FADD;
37338
37339 Opnd0 = LHS;
37340 Opnd1 = RHS;
37341 return true;
37342}
37343
37344/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
37345static SDValue combineShuffleToFMAddSub(SDNode *N,
37346 const X86Subtarget &Subtarget,
37347 SelectionDAG &DAG) {
37348 // We only handle target-independent shuffles.
37349 // FIXME: It would be easy and harmless to use the target shuffle mask
37350 // extraction tool to support more.
37351 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
37352 return SDValue();
37353
37354 MVT VT = N->getSimpleValueType(0);
37355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37356 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
37357 return SDValue();
37358
37359 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
37360 SDValue Op0 = N->getOperand(0);
37361 SDValue Op1 = N->getOperand(1);
37362 SDValue FMAdd = Op0, FMSub = Op1;
37363 if (FMSub.getOpcode() != X86ISD::FMSUB)
37364 std::swap(FMAdd, FMSub);
37365
37366 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
37367 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
37368 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
37369 FMAdd.getOperand(2) != FMSub.getOperand(2))
37370 return SDValue();
37371
37372 // Check for correct shuffle mask.
37373 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
37374 bool Op0Even;
37375 if (!isAddSubOrSubAddMask(Mask, Op0Even))
37376 return SDValue();
37377
37378 // FMAddSub takes its zeroth operand from the FMSub node.
37379 SDLoc DL(N);
37380 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
37381 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
37382 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
37383 FMAdd.getOperand(2));
37384}
37385
37386/// Try to combine a shuffle into a target-specific add-sub or
37387/// mul-add-sub node.
37388static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
37389 const X86Subtarget &Subtarget,
37390 SelectionDAG &DAG) {
37391 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
37392 return V;
37393
37394 SDValue Opnd0, Opnd1;
37395 bool IsSubAdd;
37396 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
37397 return SDValue();
37398
37399 MVT VT = N->getSimpleValueType(0);
37400 SDLoc DL(N);
37401
37402 // Try to generate X86ISD::FMADDSUB node here.
37403 SDValue Opnd2;
37404 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
37405 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
37406 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
37407 }
37408
37409 if (IsSubAdd)
37410 return SDValue();
37411
37412 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
37413 // the ADDSUB idiom has been successfully recognized. There are no known
37414 // X86 targets with 512-bit ADDSUB instructions!
37415 if (VT.is512BitVector())
37416 return SDValue();
37417
37418 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
37419}
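The X86ISD::ADDSUB node built here has the SSE3 addsubps/addsubpd behaviour: even lanes subtract and odd lanes add, which is also what makes the parity of the shuffle mask decide between ADDSUB and SUBADD. A scalar model of that lane behaviour (not LLVM code):

#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

// Scalar model of ADDSUB on v4f32: subtract in even lanes, add in odd lanes.
static V4 addsub(const V4 &A, const V4 &B) {
  V4 R{};
  for (int i = 0; i < 4; ++i)
    R[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
  return R;
}

int main() {
  V4 A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  V4 R = addsub(A, B);
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // -9 22 -27 44
  return 0;
}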
37420
37421// We are looking for a shuffle where both sources are concatenated with undef
37422// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
37423// if we can express this as a single-source shuffle, that's preferable.
37424static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
37425 const X86Subtarget &Subtarget) {
37426 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
37427 return SDValue();
37428
37429 EVT VT = N->getValueType(0);
37430
37431 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
37432 if (!VT.is128BitVector() && !VT.is256BitVector())
37433 return SDValue();
37434
37435 if (VT.getVectorElementType() != MVT::i32 &&
37436 VT.getVectorElementType() != MVT::i64 &&
37437 VT.getVectorElementType() != MVT::f32 &&
37438 VT.getVectorElementType() != MVT::f64)
37439 return SDValue();
37440
37441 SDValue N0 = N->getOperand(0);
37442 SDValue N1 = N->getOperand(1);
37443
37444 // Check that both sources are concats with undef.
37445 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
37446 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
37447 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
37448 !N1.getOperand(1).isUndef())
37449 return SDValue();
37450
37451 // Construct the new shuffle mask. Elements from the first source retain their
37452 // index, but elements from the second source no longer need to skip an undef.
37453 SmallVector<int, 8> Mask;
37454 int NumElts = VT.getVectorNumElements();
37455
37456 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
37457 for (int Elt : SVOp->getMask())
37458 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
37459
37460 SDLoc DL(N);
37461 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
37462 N1.getOperand(0));
37463 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
37464}
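The mask rewrite above depends on a simple index shift: an element at index Elt of the second (concat t2, undef) source sits at Elt - NumElts / 2 once t2 is concatenated directly behind t1. A small standalone check of that remapping with a hypothetical 4-element mask:

#include <cstdio>
#include <vector>

// Remap a two-source mask (each source is (concat tN, undef), NumElts wide)
// into a single-source mask over (concat t1, t2).
static std::vector<int> remap(const std::vector<int> &Mask, int NumElts) {
  std::vector<int> Out;
  for (int Elt : Mask)
    Out.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return Out;
}

int main() {
  // NumElts = 4: indices 0..1 address t1, 4..5 address t2 (2..3, 6..7 undef).
  for (int V : remap({0, 4, 1, 5}, 4))
    std::printf("%d ", V); // prints: 0 2 1 3, t2's elements follow t1's
  std::printf("\n");
  return 0;
}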
37465
37466/// Eliminate a redundant shuffle of a horizontal math op.
37467static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
37468 unsigned Opcode = N->getOpcode();
37469 if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
37470 if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
37471 return SDValue();
37472
37473 // For a broadcast, peek through an extract element of index 0 to find the
37474 // horizontal op: broadcast (ext_vec_elt HOp, 0)
37475 EVT VT = N->getValueType(0);
37476 if (Opcode == X86ISD::VBROADCAST) {
37477 SDValue SrcOp = N->getOperand(0);
37478 if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37479 SrcOp.getValueType() == MVT::f64 &&
37480 SrcOp.getOperand(0).getValueType() == VT &&
37481 isNullConstant(SrcOp.getOperand(1)))
37482 N = SrcOp.getNode();
37483 }
37484
37485 SDValue HOp = N->getOperand(0);
37486 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
37487 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
37488 return SDValue();
37489
37490 // 128-bit horizontal math instructions are defined to operate on adjacent
37491 // lanes of each operand as:
37492 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
37493 // ...similarly for v2f64 and v8i16.
37494 if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
37495 HOp.getOperand(0) != HOp.getOperand(1))
37496 return SDValue();
37497
37498 // The shuffle that we are eliminating may have allowed the horizontal op to
37499 // have an undemanded (undefined) operand. Duplicate the other (defined)
37500 // operand to ensure that the results are defined across all lanes without the
37501 // shuffle.
37502 auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
37503 SDValue X;
37504 if (HorizOp.getOperand(0).isUndef()) {
37505 assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
37506 X = HorizOp.getOperand(1);
37507 } else if (HorizOp.getOperand(1).isUndef()) {
37508 assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
37509 X = HorizOp.getOperand(0);
37510 } else {
37511 return HorizOp;
37512 }
37513 return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
37514 HorizOp.getValueType(), X, X);
37515 };
37516
37517 // When the operands of a horizontal math op are identical, the low half of
37518 // the result is the same as the high half. If a target shuffle is also
37519 // replicating low and high halves (and without changing the type/length of
37520 // the vector), we don't need the shuffle.
37521 if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
37522 if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
37523 // movddup (hadd X, X) --> hadd X, X
37524 // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
37525 assert((HOp.getValueType() == MVT::v2f64 ||
37526 HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
37527 return updateHOp(HOp, DAG);
37528 }
37529 return SDValue();
37530 }
37531
37532 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
37533 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
37534
37535 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
37536 // but this should be tied to whatever horizontal op matching and shuffle
37537 // canonicalization are producing.
37538 if (HOp.getValueSizeInBits() == 128 &&
37539 (isShuffleEquivalent(Mask, {0, 0}) ||
37540 isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
37541 isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
37542 return updateHOp(HOp, DAG);
37543
37544 if (HOp.getValueSizeInBits() == 256 &&
37545 (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
37546 isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
37547 isShuffleEquivalent(
37548 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
37549 return updateHOp(HOp, DAG);
37550
37551 return SDValue();
37552}
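Every fold in this function rests on one identity: a 128-bit horizontal op whose two operands are the same vector produces identical low and high halves, so a shuffle that merely replicates halves is a no-op. A scalar model of HADD on v4i32 demonstrating that (not LLVM code):

#include <array>
#include <cstdio>

using V4 = std::array<int, 4>;

// Scalar model of a 128-bit HADD: lanes 0-1 add A's pairs, lanes 2-3 add B's.
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  V4 X = {1, 2, 3, 4};
  V4 R = hadd(X, X);
  // With identical operands the low half {3, 7} equals the high half {3, 7},
  // so e.g. shuffle(R, undef, {0, 1, 0, 1}) is already R itself.
  std::printf("%d %d %d %d\n", R[0], R[1], R[2], R[3]); // 3 7 3 7
  return 0;
}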
37553
37554/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
37555/// low half of each source vector and does not set any high half elements in
37556/// the destination vector, narrow the shuffle to half its original size.
37557static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
37558 if (!Shuf->getValueType(0).isSimple())
37559 return SDValue();
37560 MVT VT = Shuf->getSimpleValueType(0);
37561 if (!VT.is256BitVector() && !VT.is512BitVector())
37562 return SDValue();
37563
37564 // See if we can ignore all of the high elements of the shuffle.
37565 ArrayRef<int> Mask = Shuf->getMask();
37566 if (!isUndefUpperHalf(Mask))
37567 return SDValue();
37568
37569 // Check if the shuffle mask accesses only the low half of each input vector
37570 // (half-index output is 0 or 2).
37571 int HalfIdx1, HalfIdx2;
37572 SmallVector<int, 8> HalfMask(Mask.size() / 2);
37573 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
37574 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
37575 return SDValue();
37576
37577 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
37578 // The trick is knowing that all of the insert/extract are actually free
37579 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
37580 // of narrow inputs into a narrow output, and that is always cheaper than
37581 // the wide shuffle that we started with.
37582 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
37583 Shuf->getOperand(1), HalfMask, HalfIdx1,
37584 HalfIdx2, false, DAG, /*UseConcat*/true);
37585}
37586
37587static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
37588 TargetLowering::DAGCombinerInfo &DCI,
37589 const X86Subtarget &Subtarget) {
37590 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
37591 if (SDValue V = narrowShuffle(Shuf, DAG))
37592 return V;
37593
37594 // If we have legalized the vector types, look for blends of FADD and FSUB
37595 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
37596 SDLoc dl(N);
37597 EVT VT = N->getValueType(0);
37598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37599 if (TLI.isTypeLegal(VT)) {
37600 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
37601 return AddSub;
37602
37603 if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
37604 return HAddSub;
37605 }
37606
37607 // Attempt to combine into a vector load/broadcast.
37608 if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
37609 Subtarget, true))
37610 return LD;
37611
37612 // For AVX2, we sometimes want to combine
37613 // (vector_shuffle <mask> (concat_vectors t1, undef)
37614 // (concat_vectors t2, undef))
37615 // Into:
37616 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
37617 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
37618 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
37619 return ShufConcat;
37620
37621 if (isTargetShuffle(N->getOpcode())) {
37622 SDValue Op(N, 0);
37623 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
37624 return Shuffle;
37625
37626 // Try recursively combining arbitrary sequences of x86 shuffle
37627 // instructions into higher-order shuffles. We do this after combining
37628 // specific PSHUF instruction sequences into their minimal form so that we
37629 // can evaluate how many specialized shuffle instructions are involved in
37630 // a particular chain.
37631 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
37632 return Res;
37633
37634 // Simplify source operands based on shuffle mask.
37635 // TODO - merge this into combineX86ShufflesRecursively.
37636 APInt KnownUndef, KnownZero;
37637 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
37638 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
37639 DCI))
37640 return SDValue(N, 0);
37641 }
37642
37643 return SDValue();
37644}
37645
37646// Simplify variable target shuffle masks based on the demanded elements.
37647// TODO: Handle DemandedBits in mask indices as well?
37648bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
37649 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
37650 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
37651 // If we're demanding all elements don't bother trying to simplify the mask.
37652 unsigned NumElts = DemandedElts.getBitWidth();
37653 if (DemandedElts.isAllOnesValue())
37654 return false;
37655
37656 SDValue Mask = Op.getOperand(MaskIndex);
37657 if (!Mask.hasOneUse())
37658 return false;
37659
37660 // Attempt to generically simplify the variable shuffle mask.
37661 APInt MaskUndef, MaskZero;
37662 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
37663 Depth + 1))
37664 return true;
37665
37666 // Attempt to extract+simplify a (constant pool load) shuffle mask.
37667 // TODO: Support other types from getTargetShuffleMaskIndices?
37668 SDValue BC = peekThroughOneUseBitcasts(Mask);
37669 EVT BCVT = BC.getValueType();
37670 auto *Load = dyn_cast<LoadSDNode>(BC);
37671 if (!Load)
37672 return false;
37673
37674 const Constant *C = getTargetConstantFromNode(Load);
37675 if (!C)
37676 return false;
37677
37678 Type *CTy = C->getType();
37679 if (!CTy->isVectorTy() ||
37680 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
37681 return false;
37682
37683 // Handle scaling for i64 elements on 32-bit targets.
37684 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
37685 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
37686 return false;
37687 unsigned Scale = NumCstElts / NumElts;
37688
37689 // Simplify mask if we have an undemanded element that is not undef.
37690 bool Simplified = false;
37691 SmallVector<Constant *, 32> ConstVecOps;
37692 for (unsigned i = 0; i != NumCstElts; ++i) {
37693 Constant *Elt = C->getAggregateElement(i);
37694 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
37695 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
37696 Simplified = true;
37697 continue;
37698 }
37699 ConstVecOps.push_back(Elt);
37700 }
37701 if (!Simplified)
37702 return false;
37703
37704 // Generate new constant pool entry + legalize immediately for the load.
37705 SDLoc DL(Op);
37706 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
37707 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
37708 SDValue NewMask = TLO.DAG.getLoad(
37709 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
37710 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
37711 Load->getAlign());
37712 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
37713}
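The constant-pool path above rewrites a variable shuffle mask so that mask elements covering only undemanded lanes become undef, including the Scale == 2 case where one demanded element is covered by two constant elements (i64 masks on 32-bit targets). A standalone sketch of that per-element loop on plain containers, with hypothetical demanded elements:

#include <cstdio>
#include <optional>
#include <string>
#include <vector>

// Model the mask constant as optional ints (nullopt plays the role of undef).
// Any constant element whose covering demanded element is false becomes undef.
static bool simplifyMaskConstant(std::vector<std::optional<int>> &Cst,
                                 const std::vector<bool> &DemandedElts) {
  unsigned Scale = Cst.size() / DemandedElts.size(); // 1, or 2 for the i64 case
  bool Simplified = false;
  for (unsigned i = 0; i != Cst.size(); ++i) {
    if (!DemandedElts[i / Scale] && Cst[i].has_value()) {
      Cst[i].reset();
      Simplified = true;
    }
  }
  return Simplified;
}

int main() {
  std::vector<std::optional<int>> Cst = {3, 1, 2, 0, 7, 5, 6, 4};
  std::vector<bool> Demanded = {true, true, false, false}; // Scale == 2
  bool Changed = simplifyMaskConstant(Cst, Demanded);
  std::printf("changed=%d:", (int)Changed);
  for (const auto &E : Cst)
    std::printf(" %s", E ? std::to_string(*E).c_str() : "u");
  std::printf("\n"); // changed=1: 3 1 2 0 u u u u
  return 0;
}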
37714
37715bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
37716 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
37717 TargetLoweringOpt &TLO, unsigned Depth) const {
37718 int NumElts = DemandedElts.getBitWidth();
37719 unsigned Opc = Op.getOpcode();
37720 EVT VT = Op.getValueType();
37721
37722 // Handle special case opcodes.
37723 switch (Opc) {
37724 case X86ISD::PMULDQ:
37725 case X86ISD::PMULUDQ: {
37726 APInt LHSUndef, LHSZero;
37727 APInt RHSUndef, RHSZero;
37728 SDValue LHS = Op.getOperand(0);
37729 SDValue RHS = Op.getOperand(1);
37730 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
37731 Depth + 1))
37732 return true;
37733 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
37734 Depth + 1))
37735 return true;
37736 // Multiply by zero.
37737 KnownZero = LHSZero | RHSZero;
37738 break;
37739 }
37740 case X86ISD::VSHL:
37741 case X86ISD::VSRL:
37742 case X86ISD::VSRA: {
37743 // We only need the bottom 64-bits of the (128-bit) shift amount.
37744 SDValue Amt = Op.getOperand(1);
37745 MVT AmtVT = Amt.getSimpleValueType();
37746 assert(AmtVT.is128BitVector() && "Unexpected value type");
37747
37748 // If we reuse the shift amount just for sse shift amounts then we know that
37749 // only the bottom 64-bits are ever used.
37750 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
37751 unsigned UseOpc = Use->getOpcode();
37752 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
37753 UseOpc == X86ISD::VSRA) &&
37754 Use->getOperand(0) != Amt;
37755 });
37756
37757 APInt AmtUndef, AmtZero;
37758 unsigned NumAmtElts = AmtVT.getVectorNumElements();
37759 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
37760 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
37761 Depth + 1, AssumeSingleUse))
37762 return true;
37763 LLVM_FALLTHROUGH;
37764 }
37765 case X86ISD::VSHLI:
37766 case X86ISD::VSRLI:
37767 case X86ISD::VSRAI: {
37768 SDValue Src = Op.getOperand(0);
37769 APInt SrcUndef;
37770 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
37771 Depth + 1))
37772 return true;
37773
37774 // Aggressively peek through ops to get at the demanded elts.
37775 if (!DemandedElts.isAllOnesValue())
37776 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
37777 Src, DemandedElts, TLO.DAG, Depth + 1))
37778 return TLO.CombineTo(
37779 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
37780 break;
37781 }
37782 case X86ISD::KSHIFTL: {
37783 SDValue Src = Op.getOperand(0);
37784 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
37785 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
37786 unsigned ShiftAmt = Amt->getZExtValue();
37787
37788 if (ShiftAmt == 0)
37789 return TLO.CombineTo(Op, Src);
37790
37791 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
37792 // single shift. We can do this if the bottom bits (which are shifted
37793 // out) are never demanded.
37794 if (Src.getOpcode() == X86ISD::KSHIFTR) {
37795 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
37796 unsigned C1 = Src.getConstantOperandVal(1);
37797 unsigned NewOpc = X86ISD::KSHIFTL;
37798 int Diff = ShiftAmt - C1;
37799 if (Diff < 0) {
37800 Diff = -Diff;
37801 NewOpc = X86ISD::KSHIFTR;
37802 }
37803
37804 SDLoc dl(Op);
37805 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
37806 return TLO.CombineTo(
37807 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
37808 }
37809 }
37810
37811 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
37812 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
37813 Depth + 1))
37814 return true;
37815
37816 KnownUndef <<= ShiftAmt;
37817 KnownZero <<= ShiftAmt;
37818 KnownZero.setLowBits(ShiftAmt);
37819 break;
37820 }
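The KSHIFTL rewrite above (and the mirrored KSHIFTR one below) merges a left shift of a right-shifted mask into one shift by the difference of the amounts, which is only safe because the low elements that the shift pair would have cleared are never demanded. The same arithmetic on a plain 16-bit integer standing in for a k-register, with hypothetical shift amounts:

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t K = 0xBEEF;       // stand-in for a 16-bit mask register
  unsigned C1 = 3, ShAmt = 5;

  // The two-shift form, (K >> C1) << ShAmt, as matched above.
  uint16_t Two = (uint16_t)((uint16_t)(K >> C1) << ShAmt);
  // The merged form: one shift by the difference (left here, since ShAmt > C1).
  uint16_t One = (uint16_t)(K << (ShAmt - C1));

  // They agree on every bit at or above position ShAmt; below that the
  // two-shift form is all zeros, hence the "never demanded" requirement.
  uint16_t HighMask = (uint16_t)~((1u << ShAmt) - 1);
  std::printf("two=0x%04x one=0x%04x agree_high=%d\n", Two, One,
              (Two & HighMask) == (One & HighMask)); // agree_high=1
  return 0;
}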
37821 case X86ISD::KSHIFTR: {
37822 SDValue Src = Op.getOperand(0);
37823 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
37824 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
37825 unsigned ShiftAmt = Amt->getZExtValue();
37826
37827 if (ShiftAmt == 0)
37828 return TLO.CombineTo(Op, Src);
37829
37830 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
37831 // single shift. We can do this if the top bits (which are shifted
37832 // out) are never demanded.
37833 if (Src.getOpcode() == X86ISD::KSHIFTL) {
37834 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
37835 unsigned C1 = Src.getConstantOperandVal(1);
37836 unsigned NewOpc = X86ISD::KSHIFTR;
37837 int Diff = ShiftAmt - C1;
37838 if (Diff < 0) {
37839 Diff = -Diff;
37840 NewOpc = X86ISD::KSHIFTL;
37841 }
37842
37843 SDLoc dl(Op);
37844 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
37845 return TLO.CombineTo(
37846 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
37847 }
37848 }
37849
37850 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
37851 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
37852 Depth + 1))
37853 return true;
37854
37855 KnownUndef.lshrInPlace(ShiftAmt);
37856 KnownZero.lshrInPlace(ShiftAmt);
37857 KnownZero.setHighBits(ShiftAmt);
37858 break;
37859 }
37860 case X86ISD::CVTSI2P:
37861 case X86ISD::CVTUI2P: {
37862 SDValue Src = Op.getOperand(0);
37863 MVT SrcVT = Src.getSimpleValueType();
37864 APInt SrcUndef, SrcZero;
37865 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37866 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
37867 Depth + 1))
37868 return true;
37869 break;
37870 }
37871 case X86ISD::PACKSS:
37872 case X86ISD::PACKUS: {
37873 SDValue N0 = Op.getOperand(0);
37874 SDValue N1 = Op.getOperand(1);
37875
37876 APInt DemandedLHS, DemandedRHS;
37877 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37878
37879 APInt SrcUndef, SrcZero;
37880 if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
37881 Depth + 1))
37882 return true;
37883 if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
37884 Depth + 1))
37885 return true;
37886
37887 // Aggressively peek through ops to get at the demanded elts.
37888 // TODO - we should do this for all target/faux shuffle ops.
37889 if (!DemandedElts.isAllOnesValue()) {
37890 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
37891 TLO.DAG, Depth + 1);
37892 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
37893 TLO.DAG, Depth + 1);
37894 if (NewN0 || NewN1) {
37895 NewN0 = NewN0 ? NewN0 : N0;
37896 NewN1 = NewN1 ? NewN1 : N1;
37897 return TLO.CombineTo(Op,
37898 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
37899 }
37900 }
37901 break;
37902 }
37903 case X86ISD::HADD:
37904 case X86ISD::HSUB:
37905 case X86ISD::FHADD:
37906 case X86ISD::FHSUB: {
37907 APInt DemandedLHS, DemandedRHS;
37908 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37909
37910 APInt LHSUndef, LHSZero;
37911 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
37912 LHSZero, TLO, Depth + 1))
37913 return true;
37914 APInt RHSUndef, RHSZero;
37915 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
37916 RHSZero, TLO, Depth + 1))
37917 return true;
37918 break;
37919 }
37920 case X86ISD::VTRUNC:
37921 case X86ISD::VTRUNCS:
37922 case X86ISD::VTRUNCUS: {
37923 SDValue Src = Op.getOperand(0);
37924 MVT SrcVT = Src.getSimpleValueType();
37925 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37926 APInt SrcUndef, SrcZero;
37927 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
37928 Depth + 1))
37929 return true;
37930 KnownZero = SrcZero.zextOrTrunc(NumElts);
37931 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
37932 break;
37933 }
37934 case X86ISD::BLENDV: {
37935 APInt SelUndef, SelZero;
37936 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
37937 SelZero, TLO, Depth + 1))
37938 return true;
37939
37940 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
37941 APInt LHSUndef, LHSZero;
37942 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
37943 LHSZero, TLO, Depth + 1))
37944 return true;
37945
37946 APInt RHSUndef, RHSZero;
37947 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
37948 RHSZero, TLO, Depth + 1))
37949 return true;
37950
37951 KnownZero = LHSZero & RHSZero;
37952 KnownUndef = LHSUndef & RHSUndef;
37953 break;
37954 }
37955 case X86ISD::VZEXT_MOVL: {
37956 // If upper demanded elements are already zero then we have nothing to do.
37957 SDValue Src = Op.getOperand(0);
37958 APInt DemandedUpperElts = DemandedElts;
37959 DemandedUpperElts.clearLowBits(1);
37960 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
37961 return TLO.CombineTo(Op, Src);
37962 break;
37963 }
37964 case X86ISD::VBROADCAST: {
37965 SDValue Src = Op.getOperand(0);
37966 MVT SrcVT = Src.getSimpleValueType();
37967 if (!SrcVT.isVector())
37968 return false;
37969 // Don't bother broadcasting if we just need the 0'th element.
37970 if (DemandedElts == 1) {
37971 if (Src.getValueType() != VT)
37972 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
37973 SDLoc(Op));
37974 return TLO.CombineTo(Op, Src);
37975 }
37976 APInt SrcUndef, SrcZero;
37977 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
37978 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
37979 Depth + 1))
37980 return true;
37981 // Aggressively peek through src to get at the demanded elt.
37982 // TODO - we should do this for all target/faux shuffle ops.
37983 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
37984 Src, SrcElts, TLO.DAG, Depth + 1))
37985 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
37986 break;
37987 }
37988 case X86ISD::VPERMV:
37989 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
37990 Depth))
37991 return true;
37992 break;
37993 case X86ISD::PSHUFB:
37994 case X86ISD::VPERMV3:
37995 case X86ISD::VPERMILPV:
37996 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
37997 Depth))
37998 return true;
37999 break;
38000 case X86ISD::VPPERM:
38001 case X86ISD::VPERMIL2:
38002 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38003 Depth))
38004 return true;
38005 break;
38006 }
38007
38008 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38009 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38010 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38011 if ((VT.is256BitVector() || VT.is512BitVector()) &&
38012 DemandedElts.lshr(NumElts / 2) == 0) {
38013 unsigned SizeInBits = VT.getSizeInBits();
38014 unsigned ExtSizeInBits = SizeInBits / 2;
38015
38016 // See if 512-bit ops only use the bottom 128-bits.
38017 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38018 ExtSizeInBits = SizeInBits / 4;
38019
38020 switch (Opc) {
38021 // Subvector broadcast.
38022 case X86ISD::SUBV_BROADCAST: {
38023 SDLoc DL(Op);
38024 SDValue Src = Op.getOperand(0);
38025 if (Src.getValueSizeInBits() > ExtSizeInBits)
38026 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38027 else if (Src.getValueSizeInBits() < ExtSizeInBits) {
38028 MVT SrcSVT = Src.getSimpleValueType().getScalarType();
38029 MVT SrcVT =
38030 MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
38031 Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
38032 }
38033 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
38034 TLO.DAG, DL, ExtSizeInBits));
38035 }
38036 // Byte shifts by immediate.
38037 case X86ISD::VSHLDQ:
38038 case X86ISD::VSRLDQ:
38039 // Shift by uniform.
38040 case X86ISD::VSHL:
38041 case X86ISD::VSRL:
38042 case X86ISD::VSRA:
38043 // Shift by immediate.
38044 case X86ISD::VSHLI:
38045 case X86ISD::VSRLI:
38046 case X86ISD::VSRAI: {
38047 SDLoc DL(Op);
38048 SDValue Ext0 =
38049 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
38050 SDValue ExtOp =
38051 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
38052 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38053 SDValue Insert =
38054 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38055 return TLO.CombineTo(Op, Insert);
38056 }
38057 case X86ISD::VPERMI: {
38058 // Simplify PERMPD/PERMQ to extract_subvector.
38059 // TODO: This should be done in shuffle combining.
38060 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
38061 SmallVector<int, 4> Mask;
38062 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
38063 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
38064 SDLoc DL(Op);
38065 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
38066 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38067 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
38068 return TLO.CombineTo(Op, Insert);
38069 }
38070 }
38071 break;
38072 }
38073 // Zero upper elements.
38074 case X86ISD::VZEXT_MOVL:
38075 // Target unary shuffles by immediate:
38076 case X86ISD::PSHUFD:
38077 case X86ISD::PSHUFLW:
38078 case X86ISD::PSHUFHW:
38079 case X86ISD::VPERMILPI:
38080 // (Non-Lane Crossing) Target Shuffles.
38081 case X86ISD::VPERMILPV:
38082 case X86ISD::VPERMIL2:
38083 case X86ISD::PSHUFB:
38084 case X86ISD::UNPCKL:
38085 case X86ISD::UNPCKH:
38086 case X86ISD::BLENDI:
38087 // Integer ops.
38088 case X86ISD::AVG:
38089 case X86ISD::PACKSS:
38090 case X86ISD::PACKUS:
38091 // Horizontal Ops.
38092 case X86ISD::HADD:
38093 case X86ISD::HSUB:
38094 case X86ISD::FHADD:
38095 case X86ISD::FHSUB: {
38096 SDLoc DL(Op);
38097 SmallVector<SDValue, 4> Ops;
38098 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
38099 SDValue SrcOp = Op.getOperand(i);
38100 EVT SrcVT = SrcOp.getValueType();
38101 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
38102 "Unsupported vector size");
38103 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
38104 ExtSizeInBits)
38105 : SrcOp);
38106 }
38107 MVT ExtVT = VT.getSimpleVT();
38108 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
38109 ExtSizeInBits / ExtVT.getScalarSizeInBits());
38110 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
38111 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
38112 SDValue Insert =
38113 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
38114 return TLO.CombineTo(Op, Insert);
38115 }
38116 }
38117 }
38118
38119 // Get target/faux shuffle mask.
38120 APInt OpUndef, OpZero;
38121 SmallVector<int, 64> OpMask;
38122 SmallVector<SDValue, 2> OpInputs;
38123 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
38124 OpZero, TLO.DAG, Depth, false))
38125 return false;
38126
38127 // Shuffle inputs must be the same size as the result.
38128 if (OpMask.size() != (unsigned)NumElts ||
38129 llvm::any_of(OpInputs, [VT](SDValue V) {
38130 return VT.getSizeInBits() != V.getValueSizeInBits() ||
38131 !V.getValueType().isVector();
38132 }))
38133 return false;
38134
38135 KnownZero = OpZero;
38136 KnownUndef = OpUndef;
38137
38138 // Check if shuffle mask can be simplified to undef/zero/identity.
38139 int NumSrcs = OpInputs.size();
38140 for (int i = 0; i != NumElts; ++i)
38141 if (!DemandedElts[i])
38142 OpMask[i] = SM_SentinelUndef;
38143
38144 if (isUndefInRange(OpMask, 0, NumElts)) {
38145 KnownUndef.setAllBits();
38146 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
38147 }
38148 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
38149 KnownZero.setAllBits();
38150 return TLO.CombineTo(
38151 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
38152 }
38153 for (int Src = 0; Src != NumSrcs; ++Src)
38154 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
38155 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
38156
38157 // Attempt to simplify inputs.
38158 for (int Src = 0; Src != NumSrcs; ++Src) {
38159 // TODO: Support inputs of different types.
38160 if (OpInputs[Src].getValueType() != VT)
38161 continue;
38162
38163 int Lo = Src * NumElts;
38164 APInt SrcElts = APInt::getNullValue(NumElts);
38165 for (int i = 0; i != NumElts; ++i)
38166 if (DemandedElts[i]) {
38167 int M = OpMask[i] - Lo;
38168 if (0 <= M && M < NumElts)
38169 SrcElts.setBit(M);
38170 }
38171
38172 // TODO - Propagate input undef/zero elts.
38173 APInt SrcUndef, SrcZero;
38174 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
38175 TLO, Depth + 1))
38176 return true;
38177 }
38178
38179 // If we don't demand all elements, then attempt to combine to a simpler
38180 // shuffle.
38181 // We need to convert the depth to something combineX86ShufflesRecursively
38182 // can handle - so pretend its Depth == 0 again, and reduce the max depth
38183 // to match. This prevents combineX86ShuffleChain from returning a
38184 // combined shuffle that's the same as the original root, causing an
38185 // infinite loop.
38186 if (!DemandedElts.isAllOnesValue()) {
38187 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
38188
38189 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
38190 for (int i = 0; i != NumElts; ++i)
38191 if (DemandedElts[i])
38192 DemandedMask[i] = i;
38193
38194 SDValue NewShuffle = combineX86ShufflesRecursively(
38195 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
38196 /*HasVarMask*/ false,
38197 /*AllowVarMask*/ true, TLO.DAG, Subtarget);
38198 if (NewShuffle)
38199 return TLO.CombineTo(Op, NewShuffle);
38200 }
38201
38202 return false;
38203}
38204
38205bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
38206 SDValue Op, const APInt &OriginalDemandedBits,
38207 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
38208 unsigned Depth) const {
38209 EVT VT = Op.getValueType();
38210 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
38211 unsigned Opc = Op.getOpcode();
38212 switch(Opc) {
38213 case X86ISD::VTRUNC: {
38214 KnownBits KnownOp;
38215 SDValue Src = Op.getOperand(0);
38216 MVT SrcVT = Src.getSimpleValueType();
38217
38218 // Simplify the input, using demanded bit information.
38219 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
38220 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
38221 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
38222 return true;
38223 break;
38224 }
38225 case X86ISD::PMULDQ:
38226 case X86ISD::PMULUDQ: {
38227 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
38228 KnownBits KnownOp;
38229 SDValue LHS = Op.getOperand(0);
38230 SDValue RHS = Op.getOperand(1);
38231 // FIXME: Can we bound this better?
38232 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
38233 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
38234 TLO, Depth + 1))
38235 return true;
38236 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
38237 TLO, Depth + 1))
38238 return true;
38239
38240 // Aggressively peek through ops to get at the demanded low bits.
38241 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
38242 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
38243 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
38244 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
38245 if (DemandedLHS || DemandedRHS) {
38246 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
38247 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
38248 return TLO.CombineTo(
38249 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
38250 }
38251 break;
38252 }
38253 case X86ISD::VSHLI: {
38254 SDValue Op0 = Op.getOperand(0);
38255
38256 unsigned ShAmt = Op.getConstantOperandVal(1);
38257 if (ShAmt >= BitWidth)
38258 break;
38259
38260 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
38261
38262 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38263 // single shift. We can do this if the bottom bits (which are shifted
38264 // out) are never demanded.
38265 if (Op0.getOpcode() == X86ISD::VSRLI &&
38266 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
38267 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
38268 if (Shift2Amt < BitWidth) {
38269 int Diff = ShAmt - Shift2Amt;
38270 if (Diff == 0)
38271 return TLO.CombineTo(Op, Op0.getOperand(0));
38272
38273 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
38274 SDValue NewShift = TLO.DAG.getNode(
38275 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
38276 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
38277 return TLO.CombineTo(Op, NewShift);
38278 }
38279 }
38280
38281 // If we are only demanding sign bits then we can use the shift source directly.
38282 unsigned NumSignBits =
38283 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
38284 unsigned UpperDemandedBits =
38285 BitWidth - OriginalDemandedBits.countTrailingZeros();
38286 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
38287 return TLO.CombineTo(Op, Op0);
38288
38289 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
38290 TLO, Depth + 1))
38291 return true;
38292
38293 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
38294 Known.Zero <<= ShAmt;
38295 Known.One <<= ShAmt;
38296
38297 // Low bits known zero.
38298 Known.Zero.setLowBits(ShAmt);
38299 return false;
38300 }
38301 case X86ISD::VSRLI: {
38302 unsigned ShAmt = Op.getConstantOperandVal(1);
38303 if (ShAmt >= BitWidth)
38304 break;
38305
38306 APInt DemandedMask = OriginalDemandedBits << ShAmt;
38307
38308 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
38309 OriginalDemandedElts, Known, TLO, Depth + 1))
38310 return true;
38311
38312 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
38313 Known.Zero.lshrInPlace(ShAmt);
38314 Known.One.lshrInPlace(ShAmt);
38315
38316 // High bits known zero.
38317 Known.Zero.setHighBits(ShAmt);
38318 return false;
38319 }
38320 case X86ISD::VSRAI: {
38321 SDValue Op0 = Op.getOperand(0);
38322 SDValue Op1 = Op.getOperand(1);
38323
38324 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
38325 if (ShAmt >= BitWidth)
38326 break;
38327
38328 APInt DemandedMask = OriginalDemandedBits << ShAmt;
38329
38330 // If we just want the sign bit then we don't need to shift it.
38331 if (OriginalDemandedBits.isSignMask())
38332 return TLO.CombineTo(Op, Op0);
38333
38334 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
38335 if (Op0.getOpcode() == X86ISD::VSHLI &&
38336 Op.getOperand(1) == Op0.getOperand(1)) {
38337 SDValue Op00 = Op0.getOperand(0);
38338 unsigned NumSignBits =
38339 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
38340 if (ShAmt < NumSignBits)
38341 return TLO.CombineTo(Op, Op00);
38342 }
38343
38344 // If any of the demanded bits are produced by the sign extension, we also
38345 // demand the input sign bit.
38346 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
38347 DemandedMask.setSignBit();
38348
38349 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
38350 TLO, Depth + 1))
38351 return true;
38352
38353 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
38354 Known.Zero.lshrInPlace(ShAmt);
38355 Known.One.lshrInPlace(ShAmt);
38356
38357 // If the input sign bit is known to be zero, or if none of the top bits
38358 // are demanded, turn this into an unsigned shift right.
38359 if (Known.Zero[BitWidth - ShAmt - 1] ||
38360 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
38361 return TLO.CombineTo(
38362 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
38363
38364 // High bits are known one.
38365 if (Known.One[BitWidth - ShAmt - 1])
38366 Known.One.setHighBits(ShAmt);
38367 return false;
38368 }
38369 case X86ISD::PEXTRB:
38370 case X86ISD::PEXTRW: {
38371 SDValue Vec = Op.getOperand(0);
38372 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38373 MVT VecVT = Vec.getSimpleValueType();
38374 unsigned NumVecElts = VecVT.getVectorNumElements();
38375
38376 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
38377 unsigned Idx = CIdx->getZExtValue();
38378 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
38379
38380 // If we demand no bits from the vector then we must have demanded
38381 // bits from the implicit zext - simplify to zero.
38382 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
38383 if (DemandedVecBits == 0)
38384 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
38385
38386 APInt KnownUndef, KnownZero;
38387 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
38388 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
38389 KnownZero, TLO, Depth + 1))
38390 return true;
38391
38392 KnownBits KnownVec;
38393 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
38394 KnownVec, TLO, Depth + 1))
38395 return true;
38396
38397 if (SDValue V = SimplifyMultipleUseDemandedBits(
38398 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
38399 return TLO.CombineTo(
38400 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
38401
38402 Known = KnownVec.zext(BitWidth);
38403 return false;
38404 }
38405 break;
38406 }
38407 case X86ISD::PINSRB:
38408 case X86ISD::PINSRW: {
38409 SDValue Vec = Op.getOperand(0);
38410 SDValue Scl = Op.getOperand(1);
38411 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
38412 MVT VecVT = Vec.getSimpleValueType();
38413
38414 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
38415 unsigned Idx = CIdx->getZExtValue();
38416 if (!OriginalDemandedElts[Idx])
38417 return TLO.CombineTo(Op, Vec);
38418
38419 KnownBits KnownVec;
38420 APInt DemandedVecElts(OriginalDemandedElts);
38421 DemandedVecElts.clearBit(Idx);
38422 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
38423 KnownVec, TLO, Depth + 1))
38424 return true;
38425
38426 KnownBits KnownScl;
38427 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
38428 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
38429 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
38430 return true;
38431
38432 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
38433 Known = KnownBits::commonBits(KnownVec, KnownScl);
38434 return false;
38435 }
38436 break;
38437 }
38438 case X86ISD::PACKSS:
38439 // PACKSS saturates to MIN/MAX integer values. So if we just want the
38440 // sign bit then we can just ask for the source operands' sign bits.
38441 // TODO - add known bits handling.
38442 if (OriginalDemandedBits.isSignMask()) {
38443 APInt DemandedLHS, DemandedRHS;
38444 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
38445
38446 KnownBits KnownLHS, KnownRHS;
38447 APInt SignMask = APInt::getSignMask(BitWidth * 2);
38448 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
38449 KnownLHS, TLO, Depth + 1))
38450 return true;
38451 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
38452 KnownRHS, TLO, Depth + 1))
38453 return true;
38454
38455 // Attempt to avoid multi-use ops if we don't need anything from them.
38456 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
38457 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
38458 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
38459 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
38460 if (DemandedOp0 || DemandedOp1) {
38461 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
38462 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
38463 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
38464 }
38465 }
38466 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
38467 break;
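// For example, if only the sign bit of each v16i8 PACKSS(v8i16, v8i16) lane
// is demanded, saturation cannot change it, so each output sign bit is just
// the sign bit of the corresponding 16-bit source element - hence the
// SignMask of twice the result bit width used above.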
38468 case X86ISD::PCMPGT:
38469 // icmp sgt(0, R) == ashr(R, BitWidth-1).
38470 // iff we only need the sign bit then we can use R directly.
38471 if (OriginalDemandedBits.isSignMask() &&
38472 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
38473 return TLO.CombineTo(Op, Op.getOperand(1));
38474 break;
38475 case X86ISD::MOVMSK: {
38476 SDValue Src = Op.getOperand(0);
38477 MVT SrcVT = Src.getSimpleValueType();
38478 unsigned SrcBits = SrcVT.getScalarSizeInBits();
38479 unsigned NumElts = SrcVT.getVectorNumElements();
38480
38481 // If we don't need the sign bits at all just return zero.
38482 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
38483 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
38484
38485 // Only demand the vector elements of the sign bits we need.
38486 APInt KnownUndef, KnownZero;
38487 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
38488 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
38489 TLO, Depth + 1))
38490 return true;
38491
38492 Known.Zero = KnownZero.zextOrSelf(BitWidth);
38493 Known.Zero.setHighBits(BitWidth - NumElts);
38494
38495 // MOVMSK only uses the MSB from each vector element.
38496 KnownBits KnownSrc;
38497 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
38498 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
38499 Depth + 1))
38500 return true;
38501
38502 if (KnownSrc.One[SrcBits - 1])
38503 Known.One.setLowBits(NumElts);
38504 else if (KnownSrc.Zero[SrcBits - 1])
38505 Known.Zero.setLowBits(NumElts);
38506
38507 // Attempt to avoid multi-use ops if we don't need anything from them.
38508 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
38509 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
38510 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38511 return false;
38512 }
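// For example, for (i32 MOVMSK (v4f32 X)) with only bit 0 of the result
// demanded, just element 0 of X is demanded, and of that element only its
// sign bit; the upper 28 bits of the i32 result are already known zero.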
38513 case X86ISD::BEXTR:
38514 case X86ISD::BEXTRI: {
38515 SDValue Op0 = Op.getOperand(0);
38516 SDValue Op1 = Op.getOperand(1);
38517
38518 // Only bottom 16-bits of the control bits are required.
38519 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38520 // NOTE: SimplifyDemandedBits won't do this for constants.
38521 uint64_t Val1 = Cst1->getZExtValue();
38522 uint64_t MaskedVal1 = Val1 & 0xFFFF;
38523 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
38524 SDLoc DL(Op);
38525 return TLO.CombineTo(
38526 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
38527 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
38528 }
38529
38530 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38531 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38532
38533 // If the length is 0, the result is 0.
38534 if (Length == 0) {
38535 Known.setAllZero();
38536 return false;
38537 }
38538
38539 if ((Shift + Length) <= BitWidth) {
38540 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
38541 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
38542 return true;
38543
38544 Known = Known.extractBits(Length, Shift);
38545 Known = Known.zextOrTrunc(BitWidth);
38546 return false;
38547 }
38548 } else {
38549 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
38550 KnownBits Known1;
38551 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
38552 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
38553 return true;
38554
38555 // If the length is 0, replace with 0.
38556 KnownBits LengthBits = Known1.extractBits(8, 8);
38557 if (LengthBits.isZero())
38558 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
38559 }
38560
38561 break;
38562 }
38563 case X86ISD::PDEP: {
38564 SDValue Op0 = Op.getOperand(0);
38565 SDValue Op1 = Op.getOperand(1);
38566
38567 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
38568 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
38569
38570 // If the demanded bits have leading zeroes, we don't demand those from the
38571 // mask.
38572 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
38573 return true;
38574
38575 // The number of possible 1s in the mask determines the number of LSBs of
38576 // operand 0 used. Undemanded bits from the mask don't matter so filter
38577 // them before counting.
38578 KnownBits Known2;
38579 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
38580 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
38581 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
38582 return true;
38583
38584 // Zeroes are retained from the mask, but not ones.
38585 Known.One.clearAllBits();
38586 // The result will have at least as many trailing zeros as the non-mask
38587 // operand since bits can only map to the same or higher bit position.
38588 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38589 return false;
38590 }
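// For example, if the PDEP mask (operand 1) can have at most four set bits
// within the demanded region, only the four lowest bits of operand 0 are
// demanded, since PDEP deposits that many source LSBs into the set mask
// positions.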
38591 }
38592
38593 return TargetLowering::SimplifyDemandedBitsForTargetNode(
38594 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
38595}
38596
38597SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
38598 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
38599 SelectionDAG &DAG, unsigned Depth) const {
38600 int NumElts = DemandedElts.getBitWidth();
38601 unsigned Opc = Op.getOpcode();
38602 EVT VT = Op.getValueType();
38603
38604 switch (Opc) {
38605 case X86ISD::PINSRB:
38606 case X86ISD::PINSRW: {
38607 // If we don't demand the inserted element, return the base vector.
38608 SDValue Vec = Op.getOperand(0);
38609 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
38610 MVT VecVT = Vec.getSimpleValueType();
38611 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
38612 !DemandedElts[CIdx->getZExtValue()])
38613 return Vec;
38614 break;
38615 }
38616 case X86ISD::VSHLI: {
38617 // If we are only demanding sign bits then we can use the shift source
38618 // directly.
38619 SDValue Op0 = Op.getOperand(0);
38620 unsigned ShAmt = Op.getConstantOperandVal(1);
38621 unsigned BitWidth = DemandedBits.getBitWidth();
38622 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
38623 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
38624 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
38625 return Op0;
38626 break;
38627 }
38628 case X86ISD::VSRAI:
38629 // iff we only need the sign bit then we can use the source directly.
38630 // TODO: generalize where we only demand extended signbits.
38631 if (DemandedBits.isSignMask())
38632 return Op.getOperand(0);
38633 break;
38634 case X86ISD::PCMPGT:
38635 // icmp sgt(0, R) == ashr(R, BitWidth-1).
38636 // iff we only need the sign bit then we can use R directly.
38637 if (DemandedBits.isSignMask() &&
38638 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
38639 return Op.getOperand(1);
38640 break;
38641 }
38642
38643 APInt ShuffleUndef, ShuffleZero;
38644 SmallVector<int, 16> ShuffleMask;
38645 SmallVector<SDValue, 2> ShuffleOps;
38646 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
38647 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
38648 // If all the demanded elts are from one operand and are inline,
38649 // then we can use the operand directly.
38650 int NumOps = ShuffleOps.size();
38651 if (ShuffleMask.size() == (unsigned)NumElts &&
38652 llvm::all_of(ShuffleOps, [VT](SDValue V) {
38653 return VT.getSizeInBits() == V.getValueSizeInBits();
38654 })) {
38655
38656 if (DemandedElts.isSubsetOf(ShuffleUndef))
38657 return DAG.getUNDEF(VT);
38658 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
38659 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
38660
38661 // Bitmask that indicates which ops have only been accessed 'inline'.
38662 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
38663 for (int i = 0; i != NumElts; ++i) {
38664 int M = ShuffleMask[i];
38665 if (!DemandedElts[i] || ShuffleUndef[i])
38666 continue;
38667 int OpIdx = M / NumElts;
38668 int EltIdx = M % NumElts;
38669 if (M < 0 || EltIdx != i) {
38670 IdentityOp.clearAllBits();
38671 break;
38672 }
38673 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
38674 if (IdentityOp == 0)
38675 break;
38676 }
38677 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
38678 "Multiple identity shuffles detected");
38679
38680 if (IdentityOp != 0)
38681 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
38682 }
38683 }
38684
38685 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
38686 Op, DemandedBits, DemandedElts, DAG, Depth);
38687}
38688
38689// Helper to peek through bitops/setcc to determine size of source vector.
38690// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
38691static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
38692 switch (Src.getOpcode()) {
38693 case ISD::SETCC:
38694 return Src.getOperand(0).getValueSizeInBits() == Size;
38695 case ISD::AND:
38696 case ISD::XOR:
38697 case ISD::OR:
38698 return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
38699 checkBitcastSrcVectorSize(Src.getOperand(1), Size);
38700 }
38701 return false;
38702}
38703
38704// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
38705static unsigned getAltBitOpcode(unsigned Opcode) {
38706 switch(Opcode) {
38707 case ISD::AND: return X86ISD::FAND;
38708 case ISD::OR: return X86ISD::FOR;
38709 case ISD::XOR: return X86ISD::FXOR;
38710 case X86ISD::ANDNP: return X86ISD::FANDN;
38711 }
38712 llvm_unreachable("Unknown bitwise opcode");
38713}
38714
38715// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
38716static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
38717 const SDLoc &DL) {
38718 EVT SrcVT = Src.getValueType();
38719 if (SrcVT != MVT::v4i1)
38720 return SDValue();
38721
38722 switch (Src.getOpcode()) {
38723 case ISD::SETCC:
38724 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
38725 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
38726 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
38727 SDValue Op0 = Src.getOperand(0);
38728 if (ISD::isNormalLoad(Op0.getNode()))
38729 return DAG.getBitcast(MVT::v4f32, Op0);
38730 if (Op0.getOpcode() == ISD::BITCAST &&
38731 Op0.getOperand(0).getValueType() == MVT::v4f32)
38732 return Op0.getOperand(0);
38733 }
38734 break;
38735 case ISD::AND:
38736 case ISD::XOR:
38737 case ISD::OR: {
38738 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
38739 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
38740 if (Op0 && Op1)
38741 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
38742 Op1);
38743 break;
38744 }
38745 }
38746 return SDValue();
38747}
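// For example, on an SSE1-only target, (v4i1 setcc (v4i32 load P), zero, setlt)
// is a pure sign-bit test, so the loaded value can be reused as v4f32 and the
// caller can emit a single MOVMSK instead of scalarizing the illegal v4i32
// compare.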
38748
38749// Helper to push sign extension of vXi1 SETCC result through bitops.
38750static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
38751 SDValue Src, const SDLoc &DL) {
38752 switch (Src.getOpcode()) {
38753 case ISD::SETCC:
38754 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
38755 case ISD::AND:
38756 case ISD::XOR:
38757 case ISD::OR:
38758 return DAG.getNode(
38759 Src.getOpcode(), DL, SExtVT,
38760 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
38761 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
38762 }
38763 llvm_unreachable("Unexpected node type for vXi1 sign extension");
38764}
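// For example, (or (setcc A, B, cc0), (setcc C, D, cc1)) is rewritten as
// (or (sign_extend setcc), (sign_extend setcc)) in SExtVT, keeping the bitop
// in the wider type so the extended compares can be matched directly.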
38765
38766// Try to match patterns such as
38767// (i16 bitcast (v16i1 x))
38768// ->
38769 // (i16 movmsk (v16i8 sext (v16i1 x)))
38770// before the illegal vector is scalarized on subtargets that don't have legal
38771// vxi1 types.
38772static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
38773 const SDLoc &DL,
38774 const X86Subtarget &Subtarget) {
38775 EVT SrcVT = Src.getValueType();
38776 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
38777 return SDValue();
38778
38779 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
38780 // legalization destroys the v4i32 type.
38781 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
38782 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
38783 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
38784 DAG.getBitcast(MVT::v4f32, V));
38785 return DAG.getZExtOrTrunc(V, DL, VT);
38786 }
38787 }
38788
38789 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
38790 // movmskb even with avx512. This will be better than truncating to vXi1 and
38791 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
38792 // vpcmpeqb/vpcmpgtb.
38793 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
38794 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
38795 Src.getOperand(0).getValueType() == MVT::v32i8 ||
38796 Src.getOperand(0).getValueType() == MVT::v64i8);
38797
38798 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
38799 // directly with vpmovmskb/vmovmskps/vmovmskpd.
38800 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
38801 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
38802 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
38803 EVT CmpVT = Src.getOperand(0).getValueType();
38804 EVT EltVT = CmpVT.getVectorElementType();
38805 if (CmpVT.getSizeInBits() <= 256 &&
38806 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
38807 PreferMovMsk = true;
38808 }
38809
38810 // With AVX512 vxi1 types are legal and we prefer using k-regs.
38811 // MOVMSK is supported in SSE2 or later.
38812 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
38813 return SDValue();
38814
38815 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
38816 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
38817 // v8i16 and v16i16.
38818 // For these two cases, we can shuffle the upper element bytes to a
38819 // consecutive sequence at the start of the vector and treat the results as
38820 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
38821 // for v16i16 this is not the case, because the shuffle is expensive, so we
38822 // avoid sign-extending to this type entirely.
38823 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
38824 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
38825 MVT SExtVT;
38826 bool PropagateSExt = false;
38827 switch (SrcVT.getSimpleVT().SimpleTy) {
38828 default:
38829 return SDValue();
38830 case MVT::v2i1:
38831 SExtVT = MVT::v2i64;
38832 break;
38833 case MVT::v4i1:
38834 SExtVT = MVT::v4i32;
38835 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
38836 // sign-extend to a 256-bit operation to avoid truncation.
38837 if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
38838 SExtVT = MVT::v4i64;
38839 PropagateSExt = true;
38840 }
38841 break;
38842 case MVT::v8i1:
38843 SExtVT = MVT::v8i16;
38844 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
38845 // sign-extend to a 256-bit operation to match the compare.
38846 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
38847 // 256-bit because the shuffle is cheaper than sign extending the result of
38848 // the compare.
38849 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
38850 checkBitcastSrcVectorSize(Src, 512))) {
38851 SExtVT = MVT::v8i32;
38852 PropagateSExt = true;
38853 }
38854 break;
38855 case MVT::v16i1:
38856 SExtVT = MVT::v16i8;
38857 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
38858 // it is not profitable to sign-extend to 256-bit because this will
38859 // require an extra cross-lane shuffle which is more expensive than
38860 // truncating the result of the compare to 128-bits.
38861 break;
38862 case MVT::v32i1:
38863 SExtVT = MVT::v32i8;
38864 break;
38865 case MVT::v64i1:
38866 // If we have AVX512F, but not AVX512BW, and the input is truncated from
38867 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
38868 if (Subtarget.hasAVX512()) {
38869 if (Subtarget.hasBWI())
38870 return SDValue();
38871 SExtVT = MVT::v64i8;
38872 break;
38873 }
38874 // Split if this is a <64 x i8> comparison result.
38875 if (checkBitcastSrcVectorSize(Src, 512)) {
38876 SExtVT = MVT::v64i8;
38877 break;
38878 }
38879 return SDValue();
38880 };
38881
38882 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
38883 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
38884
38885 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
38886 V = getPMOVMSKB(DL, V, DAG, Subtarget);
38887 } else {
38888 if (SExtVT == MVT::v8i16)
38889 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
38890 DAG.getUNDEF(MVT::v8i16));
38891 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
38892 }
38893
38894 EVT IntVT =
38895 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
38896 V = DAG.getZExtOrTrunc(V, DL, IntVT);
38897 return DAG.getBitcast(VT, V);
38898}
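// For example, with AVX, (i8 bitcast (v8i1 setcc (v8i32 A), (v8i32 B), cc))
// propagates the sign extension to v8i32 to match the 256-bit compare and
// feeds that straight to MOVMSK, while a v8i1 result of a 128-bit compare is
// sign-extended to v8i16 and PACKSSed to v16i8 before the MOVMSK.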
38899
38900// Convert a vXi1 constant build vector to the same width scalar integer.
38901static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
38902 EVT SrcVT = Op.getValueType();
38903 assert(SrcVT.getVectorElementType() == MVT::i1 &&
38904 "Expected a vXi1 vector");
38905 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
38906 "Expected a constant build vector");
38907
38908 APInt Imm(SrcVT.getVectorNumElements(), 0);
38909 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
38910 SDValue In = Op.getOperand(Idx);
38911 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
38912 Imm.setBit(Idx);
38913 }
38914 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
38915 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
38916}
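// For example, (v4i1 build_vector 1, 0, 1, 1) becomes the i4 constant 0b1101:
// element 0 maps to bit 0, and undef elements are treated as zero.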
38917
38918static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
38919 TargetLowering::DAGCombinerInfo &DCI,
38920 const X86Subtarget &Subtarget) {
38921 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
38922
38923 if (!DCI.isBeforeLegalizeOps())
38924 return SDValue();
38925
38926 // Only do this if we have k-registers.
38927 if (!Subtarget.hasAVX512())
38928 return SDValue();
38929
38930 EVT DstVT = N->getValueType(0);
38931 SDValue Op = N->getOperand(0);
38932 EVT SrcVT = Op.getValueType();
38933
38934 if (!Op.hasOneUse())
38935 return SDValue();
38936
38937 // Look for logic ops.
38938 if (Op.getOpcode() != ISD::AND &&
38939 Op.getOpcode() != ISD::OR &&
38940 Op.getOpcode() != ISD::XOR)
38941 return SDValue();
38942
38943 // Make sure we have a bitcast between mask registers and a scalar type.
38944 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
38945 DstVT.isScalarInteger()) &&
38946 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
38947 SrcVT.isScalarInteger()))
38948 return SDValue();
38949
38950 SDValue LHS = Op.getOperand(0);
38951 SDValue RHS = Op.getOperand(1);
38952
38953 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
38954 LHS.getOperand(0).getValueType() == DstVT)
38955 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
38956 DAG.getBitcast(DstVT, RHS));
38957
38958 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
38959 RHS.getOperand(0).getValueType() == DstVT)
38960 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
38961 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
38962
38963 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
38964 // Most of these have to move a constant from the scalar domain anyway.
38965 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
38966 RHS = combinevXi1ConstantToInteger(RHS, DAG);
38967 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
38968 DAG.getBitcast(DstVT, LHS), RHS);
38969 }
38970
38971 return SDValue();
38972}
38973
38974static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
38975 const X86Subtarget &Subtarget) {
38976 SDLoc DL(BV);
38977 unsigned NumElts = BV->getNumOperands();
38978 SDValue Splat = BV->getSplatValue();
38979
38980 // Build MMX element from integer GPR or SSE float values.
38981 auto CreateMMXElement = [&](SDValue V) {
38982 if (V.isUndef())
38983 return DAG.getUNDEF(MVT::x86mmx);
38984 if (V.getValueType().isFloatingPoint()) {
38985 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
38986 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
38987 V = DAG.getBitcast(MVT::v2i64, V);
38988 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
38989 }
38990 V = DAG.getBitcast(MVT::i32, V);
38991 } else {
38992 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
38993 }
38994 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
38995 };
38996
38997 // Convert build vector ops to MMX data in the bottom elements.
38998 SmallVector<SDValue, 8> Ops;
38999
39000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39001
39002 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
39003 if (Splat) {
39004 if (Splat.isUndef())
39005 return DAG.getUNDEF(MVT::x86mmx);
39006
39007 Splat = CreateMMXElement(Splat);
39008
39009 if (Subtarget.hasSSE1()) {
39010 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
39011 if (NumElts == 8)
39012 Splat = DAG.getNode(
39013 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39014 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
39015 TLI.getPointerTy(DAG.getDataLayout())),
39016 Splat, Splat);
39017
39018 // Use PSHUFW to repeat 16-bit elements.
39019 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
39020 return DAG.getNode(
39021 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
39022 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
39023 TLI.getPointerTy(DAG.getDataLayout())),
39024 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
39025 }
39026 Ops.append(NumElts, Splat);
39027 } else {
39028 for (unsigned i = 0; i != NumElts; ++i)
39029 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
39030 }
39031
39032 // Use tree of PUNPCKLs to build up general MMX vector.
39033 while (Ops.size() > 1) {
39034 unsigned NumOps = Ops.size();
39035 unsigned IntrinOp =
39036 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
39037 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
39038 : Intrinsic::x86_mmx_punpcklbw));
39039 SDValue Intrin = DAG.getTargetConstant(
39040 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
39041 for (unsigned i = 0; i != NumOps; i += 2)
39042 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
39043 Ops[i], Ops[i + 1]);
39044 Ops.resize(NumOps / 2);
39045 }
39046
39047 return Ops[0];
39048}
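// For example, a non-splat v8i8 build vector is lowered by moving each element
// into an MMX register via MMX_MOVW2D and then merging them with a tree of
// punpcklbw, punpcklwd and punpckldq intrinsics (8 -> 4 -> 2 -> 1 values),
// while a splat instead uses the (PUNPCKL+)PSHUFW broadcast noted above.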
39049
39050// Recursive function that attempts to find if a bool vector node was originally
39051// a vector/float/double that got truncated/extended/bitcast to/from a scalar
39052// integer. If so, replace the scalar ops with bool vector equivalents back down
39053// the chain.
39054static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
39055 SelectionDAG &DAG,
39056 const X86Subtarget &Subtarget) {
39057 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39058 unsigned Opc = V.getOpcode();
39059 switch (Opc) {
39060 case ISD::BITCAST: {
39061 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
39062 SDValue Src = V.getOperand(0);
39063 EVT SrcVT = Src.getValueType();
39064 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
39065 return DAG.getBitcast(VT, Src);
39066 break;
39067 }
39068 case ISD::TRUNCATE: {
39069 // If we find a suitable source, a truncated scalar becomes a subvector.
39070 SDValue Src = V.getOperand(0);
39071 EVT NewSrcVT =
39072 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
39073 if (TLI.isTypeLegal(NewSrcVT))
39074 if (SDValue N0 =
39075 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39076 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
39077 DAG.getIntPtrConstant(0, DL));
39078 break;
39079 }
39080 case ISD::ANY_EXTEND:
39081 case ISD::ZERO_EXTEND: {
39082 // If we find a suitable source, an extended scalar becomes a subvector.
39083 SDValue Src = V.getOperand(0);
39084 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
39085 Src.getScalarValueSizeInBits());
39086 if (TLI.isTypeLegal(NewSrcVT))
39087 if (SDValue N0 =
39088 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
39089 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
39090 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
39091 : DAG.getConstant(0, DL, VT),
39092 N0, DAG.getIntPtrConstant(0, DL));
39093 break;
39094 }
39095 case ISD::OR: {
39096 // If we find suitable sources, we can just move an OR to the vector domain.
39097 SDValue Src0 = V.getOperand(0);
39098 SDValue Src1 = V.getOperand(1);
39099 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39100 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
39101 return DAG.getNode(Opc, DL, VT, N0, N1);
39102 break;
39103 }
39104 case ISD::SHL: {
39105 // If we find a suitable source, a SHL becomes a KSHIFTL.
39106 SDValue Src0 = V.getOperand(0);
39107 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
39108 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
39109 break;
39110
39111 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
39112 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
39113 return DAG.getNode(
39114 X86ISD::KSHIFTL, DL, VT, N0,
39115 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
39116 break;
39117 }
39118 }
39119 return SDValue();
39120}
39121
39122static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
39123 TargetLowering::DAGCombinerInfo &DCI,
39124 const X86Subtarget &Subtarget) {
39125 SDValue N0 = N->getOperand(0);
39126 EVT VT = N->getValueType(0);
39127 EVT SrcVT = N0.getValueType();
39128
39129 // Try to match patterns such as
39130 // (i16 bitcast (v16i1 x))
39131 // ->
39132 // (i16 movmsk (v16i8 sext (v16i1 x)))
39133 // before the setcc result is scalarized on subtargets that don't have legal
39134 // vxi1 types.
39135 if (DCI.isBeforeLegalize()) {
39136 SDLoc dl(N);
39137 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
39138 return V;
39139
39140 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39141 // type, widen both sides to avoid a trip through memory.
39142 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
39143 Subtarget.hasAVX512()) {
39144 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
39145 N0 = DAG.getBitcast(MVT::v8i1, N0);
39146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
39147 DAG.getIntPtrConstant(0, dl));
39148 }
39149
39150 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
39151 // type, widen both sides to avoid a trip through memory.
39152 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
39153 Subtarget.hasAVX512()) {
39154 // Use zeros for the widening if we already have some zeroes. This can
39155 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
39156 // stream of this.
39157 // FIXME: It might make sense to detect a concat_vectors with a mix of
39158 // zeroes and undef and turn it into insert_subvector for i1 vectors as
39159 // a separate combine. What we can't do is canonicalize the operands of
39160 // such a concat or we'll get into a loop with SimplifyDemandedBits.
39161 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
39162 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
39163 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
39164 SrcVT = LastOp.getValueType();
39165 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39166 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
39167 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
39168 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39169 N0 = DAG.getBitcast(MVT::i8, N0);
39170 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39171 }
39172 }
39173
39174 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
39175 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
39176 Ops[0] = N0;
39177 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
39178 N0 = DAG.getBitcast(MVT::i8, N0);
39179 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
39180 }
39181 } else {
39182 // If we're bitcasting from iX to vXi1, see if the integer originally
39183 // began as a vXi1 and whether we can remove the bitcast entirely.
39184 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
39185 SrcVT.isScalarInteger() &&
39186 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
39187 if (SDValue V =
39188 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
39189 return V;
39190 }
39191 }
39192
39193 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
39194 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
39195 // due to insert_subvector legalization on KNL. By promoting the copy to i16
39196 // we can help with known bits propagation from the vXi1 domain to the
39197 // scalar domain.
39198 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
39199 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39200 N0.getOperand(0).getValueType() == MVT::v16i1 &&
39201 isNullConstant(N0.getOperand(1)))
39202 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
39203 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
39204
39205 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
39206 // and the vbroadcast_load are both integer or both fp. In some cases this
39207 // will remove the bitcast entirely.
39208 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
39209 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
39210 auto *BCast = cast<MemIntrinsicSDNode>(N0);
39211 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
39212 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
39213 // Don't swap i8/i16 since we don't have fp types of that size.
39214 if (MemSize >= 32) {
39215 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
39216 : MVT::getIntegerVT(MemSize);
39217 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
39218 : MVT::getIntegerVT(SrcVTSize);
39219 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
39220
39221 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
39222 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
39223 SDValue ResNode =
39224 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
39225 MemVT, BCast->getMemOperand());
39226 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
39227 return DAG.getBitcast(VT, ResNode);
39228 }
39229 }
39230
39231 // Since MMX types are special and don't usually play with other vector types,
39232 // it's better to handle them early to be sure we emit efficient code by
39233 // avoiding store-load conversions.
39234 if (VT == MVT::x86mmx) {
39235 // Detect MMX constant vectors.
39236 APInt UndefElts;
39237 SmallVector<APInt, 1> EltBits;
39238 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
39239 SDLoc DL(N0);
39240 // Handle zero-extension of i32 with MOVD.
39241 if (EltBits[0].countLeadingZeros() >= 32)
39242 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
39243 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
39244 // Else, bitcast to a double.
39245 // TODO - investigate supporting sext 32-bit immediates on x86_64.
39246 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
39247 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
39248 }
39249
39250 // Detect bitcasts to x86mmx low word.
39251 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
39252 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
39253 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
39254 bool LowUndef = true, AllUndefOrZero = true;
39255 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
39256 SDValue Op = N0.getOperand(i);
39257 LowUndef &= Op.isUndef() || (i >= e/2);
39258 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
39259 }
39260 if (AllUndefOrZero) {
39261 SDValue N00 = N0.getOperand(0);
39262 SDLoc dl(N00);
39263 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
39264 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
39265 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
39266 }
39267 }
39268
39269 // Detect bitcasts of 64-bit build vectors and convert to a
39270 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
39271 // lowest element.
39272 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
39273 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
39274 SrcVT == MVT::v8i8))
39275 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
39276
39277 // Detect bitcasts of element or subvector extractions to x86mmx.
39278 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
39279 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
39280 isNullConstant(N0.getOperand(1))) {
39281 SDValue N00 = N0.getOperand(0);
39282 if (N00.getValueType().is128BitVector())
39283 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
39284 DAG.getBitcast(MVT::v2i64, N00));
39285 }
39286
39287 // Detect bitcasts from FP_TO_SINT to x86mmx.
39288 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
39289 SDLoc DL(N0);
39290 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
39291 DAG.getUNDEF(MVT::v2i32));
39292 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
39293 DAG.getBitcast(MVT::v2i64, Res));
39294 }
39295 }
39296
39297 // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
39298 // most of these to scalar anyway.
39299 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
39300 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39301 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
39302 return combinevXi1ConstantToInteger(N0, DAG);
39303 }
39304
39305 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
39306 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
39307 isa<ConstantSDNode>(N0)) {
39308 auto *C = cast<ConstantSDNode>(N0);
39309 if (C->isAllOnesValue())
39310 return DAG.getConstant(1, SDLoc(N0), VT);
39311 if (C->isNullValue())
39312 return DAG.getConstant(0, SDLoc(N0), VT);
39313 }
39314
39315 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
39316 // Turn it into a sign bit compare that produces a k-register. This avoids
39317 // a trip through a GPR.
39318 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
39319 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
39320 isPowerOf2_32(VT.getVectorNumElements())) {
39321 unsigned NumElts = VT.getVectorNumElements();
39322 SDValue Src = N0;
39323
39324 // Peek through truncate.
39325 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
39326 Src = N0.getOperand(0);
39327
39328 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
39329 SDValue MovmskIn = Src.getOperand(0);
39330 MVT MovmskVT = MovmskIn.getSimpleValueType();
39331 unsigned MovMskElts = MovmskVT.getVectorNumElements();
39332
39333 // We allow extra bits of the movmsk to be used since they are known zero.
39334 // We can't convert a VPMOVMSKB without avx512bw.
39335 if (MovMskElts <= NumElts &&
39336 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
39337 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
39338 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
39339 SDLoc dl(N);
39340 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
39341 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
39342 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
39343 if (EVT(CmpVT) == VT)
39344 return Cmp;
39345
39346 // Pad with zeroes up to original VT to replace the zeroes that were
39347 // being used from the MOVMSK.
39348 unsigned NumConcats = NumElts / MovMskElts;
39349 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
39350 Ops[0] = Cmp;
39351 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
39352 }
39353 }
39354 }
39355
39356 // Try to remove bitcasts from input and output of mask arithmetic to
39357 // remove GPR<->K-register crossings.
39358 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
39359 return V;
39360
39361 // Convert a bitcasted integer logic operation that has one bitcasted
39362 // floating-point operand into a floating-point logic operation. This may
39363 // create a load of a constant, but that is cheaper than materializing the
39364 // constant in an integer register and transferring it to an SSE register or
39365 // transferring the SSE operand to integer register and back.
39366 unsigned FPOpcode;
39367 switch (N0.getOpcode()) {
39368 case ISD::AND: FPOpcode = X86ISD::FAND; break;
39369 case ISD::OR: FPOpcode = X86ISD::FOR; break;
39370 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
39371 default: return SDValue();
39372 }
39373
39374 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
39375 (Subtarget.hasSSE2() && VT == MVT::f64)))
39376 return SDValue();
39377
39378 SDValue LogicOp0 = N0.getOperand(0);
39379 SDValue LogicOp1 = N0.getOperand(1);
39380 SDLoc DL0(N0);
39381
39382 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
39383 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
39384 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
39385 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
39386 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
39387 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
39388 }
39389 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
39390 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
39391 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
39392 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
39393 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
39394 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
39395 }
39396
39397 return SDValue();
39398}
39399
39400 // Given an ABS node, detect the following pattern:
39401// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
39402// This is useful as it is the input into a SAD pattern.
39403static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
39404 SDValue AbsOp1 = Abs->getOperand(0);
39405 if (AbsOp1.getOpcode() != ISD::SUB)
39406 return false;
39407
39408 Op0 = AbsOp1.getOperand(0);
39409 Op1 = AbsOp1.getOperand(1);
39410
39411 // Check if the operands of the sub are zero-extended from vectors of i8.
39412 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
39413 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
39414 Op1.getOpcode() != ISD::ZERO_EXTEND ||
39415 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
39416 return false;
39417
39418 return true;
39419}
39420
39421// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
39422// to these zexts.
39423static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
39424 const SDValue &Zext1, const SDLoc &DL,
39425 const X86Subtarget &Subtarget) {
39426 // Find the appropriate width for the PSADBW.
39427 EVT InVT = Zext0.getOperand(0).getValueType();
39428 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
39429
39430 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
39431 // fill in the missing vector elements with 0.
39432 unsigned NumConcat = RegSize / InVT.getSizeInBits();
39433 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
39434 Ops[0] = Zext0.getOperand(0);
39435 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
39436 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
39437 Ops[0] = Zext1.getOperand(0);
39438 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
39439
39440 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
39441 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39442 ArrayRef<SDValue> Ops) {
39443 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
39444 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
39445 };
39446 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
39447 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
39448 PSADBWBuilder);
39449}
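// For example, two zexts of <4 x i8> are padded with zero vectors up to
// <16 x i8> and fed to a single 128-bit PSADBW; the padded lanes contribute
// |0 - 0| = 0, so the low i64 element of the v2i64 result holds the sum of
// absolute differences of the original four byte pairs.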
39450
39451 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
39452// PHMINPOSUW.
39453static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
39454 const X86Subtarget &Subtarget) {
39455 // Bail without SSE41.
39456 if (!Subtarget.hasSSE41())
39457 return SDValue();
39458
39459 EVT ExtractVT = Extract->getValueType(0);
39460 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
39461 return SDValue();
39462
39463 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
39464 ISD::NodeType BinOp;
39465 SDValue Src = DAG.matchBinOpReduction(
39466 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
39467 if (!Src)
39468 return SDValue();
39469
39470 EVT SrcVT = Src.getValueType();
39471 EVT SrcSVT = SrcVT.getScalarType();
39472 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
39473 return SDValue();
39474
39475 SDLoc DL(Extract);
39476 SDValue MinPos = Src;
39477
39478 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
39479 while (SrcVT.getSizeInBits() > 128) {
39480 SDValue Lo, Hi;
39481 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
39482 SrcVT = Lo.getValueType();
39483 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
39484 }
39485 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
39486 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
39487 "Unexpected value type");
39488
39489 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
39490 // to flip the value accordingly.
39491 SDValue Mask;
39492 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
39493 if (BinOp == ISD::SMAX)
39494 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
39495 else if (BinOp == ISD::SMIN)
39496 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
39497 else if (BinOp == ISD::UMAX)
39498 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
39499
39500 if (Mask)
39501 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
39502
39503 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
39504 // shuffling each upper element down and inserting zeros. This means that the
39505 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
39506 // ready for the PHMINPOS.
39507 if (ExtractVT == MVT::i8) {
39508 SDValue Upper = DAG.getVectorShuffle(
39509 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
39510 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
39511 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
39512 }
39513
39514 // Perform the PHMINPOS on a v8i16 vector.
39515 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
39516 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
39517 MinPos = DAG.getBitcast(SrcVT, MinPos);
39518
39519 if (Mask)
39520 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
39521
39522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
39523 DAG.getIntPtrConstant(0, DL));
39524}
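// For example, an i16 SMAX reduction over v8i16 is handled by XORing every
// element with 0x7FFF (turning signed-max into unsigned-min), running
// PHMINPOSUW, and XORing the result with 0x7FFF again before extracting
// element 0.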
39525
39526// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
39527static SDValue combineHorizontalPredicateResult(SDNode *Extract,
39528 SelectionDAG &DAG,
39529 const X86Subtarget &Subtarget) {
39530 // Bail without SSE2.
39531 if (!Subtarget.hasSSE2())
39532 return SDValue();
39533
39534 EVT ExtractVT = Extract->getValueType(0);
39535 unsigned BitWidth = ExtractVT.getSizeInBits();
39536 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
39537 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
39538 return SDValue();
39539
39540 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
39541 ISD::NodeType BinOp;
39542 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
39543 if (!Match && ExtractVT == MVT::i1)
39544 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
39545 if (!Match)
39546 return SDValue();
39547
39548 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
39549 // which we can't support here for now.
39550 if (Match.getScalarValueSizeInBits() != BitWidth)
39551 return SDValue();
39552
39553 SDValue Movmsk;
39554 SDLoc DL(Extract);
39555 EVT MatchVT = Match.getValueType();
39556 unsigned NumElts = MatchVT.getVectorNumElements();
39557 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
39558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39559
39560 if (ExtractVT == MVT::i1) {
39561 // Special case for (pre-legalization) vXi1 reductions.
39562 if (NumElts > 64 || !isPowerOf2_32(NumElts))
39563 return SDValue();
39564 if (TLI.isTypeLegal(MatchVT)) {
39565 // If this is a legal AVX512 predicate type then we can just bitcast.
39566 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
39567 Movmsk = DAG.getBitcast(MovmskVT, Match);
39568 } else {
39569 // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
39570 // PCMPEQQ (SSE41+), use PCMPEQD instead.
39571 if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
39572 Match.getOpcode() == ISD::SETCC &&
39573 ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
39574 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
39575 ISD::CondCode::SETEQ) {
39576 SDValue Vec = Match.getOperand(0);
39577 if (Vec.getValueType().getScalarType() == MVT::i64 &&
39578 (2 * NumElts) <= MaxElts) {
39579 NumElts *= 2;
39580 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
39581 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
39582 Match = DAG.getSetCC(
39583 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
39584 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
39585 }
39586 }
39587
39588 // Use combineBitcastvxi1 to create the MOVMSK.
39589 while (NumElts > MaxElts) {
39590 SDValue Lo, Hi;
39591 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
39592 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
39593 NumElts /= 2;
39594 }
39595 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
39596 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
39597 }
39598 if (!Movmsk)
39599 return SDValue();
39600 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
39601 } else {
39602 // FIXME: Better handling of k-registers or 512-bit vectors?
39603 unsigned MatchSizeInBits = Match.getValueSizeInBits();
39604 if (!(MatchSizeInBits == 128 ||
39605 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
39606 return SDValue();
39607
39608 // Make sure this isn't a vector of 1 element. The perf win from using
39609 // MOVMSK diminishes with fewer elements in the reduction, but it is
39610 // generally better to get the comparison over to the GPRs as soon as
39611 // possible to reduce the number of vector ops.
39612 if (Match.getValueType().getVectorNumElements() < 2)
39613 return SDValue();
39614
39615 // Check that we are extracting a reduction of all sign bits.
39616 if (DAG.ComputeNumSignBits(Match) != BitWidth)
39617 return SDValue();
39618
39619 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
39620 SDValue Lo, Hi;
39621 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
39622 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
39623 MatchSizeInBits = Match.getValueSizeInBits();
39624 }
39625
39626 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
39627 MVT MaskSrcVT;
39628 if (64 == BitWidth || 32 == BitWidth)
39629 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
39630 MatchSizeInBits / BitWidth);
39631 else
39632 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
39633
39634 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
39635 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
39636 NumElts = MaskSrcVT.getVectorNumElements();
39637 }
39638 assert((NumElts <= 32 || NumElts == 64) &&
39639 "Not expecting more than 64 elements");
39640
39641 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
39642 if (BinOp == ISD::XOR) {
39643 // parity -> (PARITY(MOVMSK X))
39644 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
39645 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
39646 }
39647
39648 SDValue CmpC;
39649 ISD::CondCode CondCode;
39650 if (BinOp == ISD::OR) {
39651 // any_of -> MOVMSK != 0
39652 CmpC = DAG.getConstant(0, DL, CmpVT);
39653 CondCode = ISD::CondCode::SETNE;
39654 } else {
39655 // all_of -> MOVMSK == ((1 << NumElts) - 1)
39656 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
39657 DL, CmpVT);
39658 CondCode = ISD::CondCode::SETEQ;
39659 }
39660
39661 // The setcc produces an i8 of 0/1, so extend that to the result width and
39662 // negate to get the final 0/-1 mask value.
39663 EVT SetccVT =
39664 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
39665 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
39666 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
39667 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
39668 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
39669}
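Illustrative note (not part of the source): once the reduction has been lowered to a MOVMSK-style bitmask with one bit per lane, the any_of/all_of/parity checks above collapse to plain integer tests. A minimal scalar sketch, assuming a hypothetical Mask value holding the per-lane sign bits and NumElts <= 32:

#include <cstdint>

// any_of  -> MOVMSK != 0
bool anyOf(uint32_t Mask) { return Mask != 0; }

// all_of  -> MOVMSK == ((1 << NumElts) - 1), i.e. the low NumElts bits set.
bool allOf(uint32_t Mask, unsigned NumElts) {
  uint32_t AllLanes = NumElts == 32 ? ~0u : ((1u << NumElts) - 1);
  return Mask == AllLanes;
}

// xor reduction -> PARITY(MOVMSK X); __builtin_popcount is a GCC/Clang builtin.
bool parityOf(uint32_t Mask) { return (__builtin_popcount(Mask) & 1) != 0; }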
39670
39671static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
39672 const X86Subtarget &Subtarget) {
39673 // PSADBW is only supported on SSE2 and up.
39674 if (!Subtarget.hasSSE2())
39675 return SDValue();
39676
39677 EVT ExtractVT = Extract->getValueType(0);
39678 // Verify the type we're extracting is either i32 or i64.
39679 // FIXME: Could support other types, but this is what we have coverage for.
39680 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
39681 return SDValue();
39682
39683 EVT VT = Extract->getOperand(0).getValueType();
39684 if (!isPowerOf2_32(VT.getVectorNumElements()))
39685 return SDValue();
39686
39687 // Match shuffle + add pyramid.
39688 ISD::NodeType BinOp;
39689 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
39690
39691 // The operand is expected to be zero extended from i8
39692 // (verified in detectZextAbsDiff).
39693 // To convert to i64 and above, an additional any/zero/sign
39694 // extend is expected.
39695 // The zero extend from 32 bits has no mathematical effect on the result.
39696 // Also, the sign extend is effectively a zero extend
39697 // (it extends the sign bit, which is zero).
39698 // So it is correct to skip the sign/zero extend instruction.
39699 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
39700 Root.getOpcode() == ISD::ZERO_EXTEND ||
39701 Root.getOpcode() == ISD::ANY_EXTEND))
39702 Root = Root.getOperand(0);
39703
39704 // If there was a match, we want Root to be the ABS node at the root of an
39705 // abs-diff pattern.
39706 if (!Root || Root.getOpcode() != ISD::ABS)
39707 return SDValue();
39708
39709 // Check whether we have an abs-diff pattern feeding into the ABS node.
39710 SDValue Zext0, Zext1;
39711 if (!detectZextAbsDiff(Root, Zext0, Zext1))
39712 return SDValue();
39713
39714 // Create the SAD instruction.
39715 SDLoc DL(Extract);
39716 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
39717
39718 // If the original vector was wider than 8 elements, sum over the results
39719 // in the SAD vector.
39720 unsigned Stages = Log2_32(VT.getVectorNumElements());
39721 EVT SadVT = SAD.getValueType();
39722 if (Stages > 3) {
39723 unsigned SadElems = SadVT.getVectorNumElements();
39724
39725 for(unsigned i = Stages - 3; i > 0; --i) {
39726 SmallVector<int, 16> Mask(SadElems, -1);
39727 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
39728 Mask[j] = MaskEnd + j;
39729
39730 SDValue Shuffle =
39731 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
39732 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
39733 }
39734 }
39735
39736 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
39737 // Return the lowest ExtractSizeInBits bits.
39738 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
39739 SadVT.getSizeInBits() / ExtractSizeInBits);
39740 SAD = DAG.getBitcast(ResVT, SAD);
39741 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
39742 Extract->getOperand(1));
39743}
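A rough scalar model (an assumption-laden sketch, not the DAG code): after PSADBW each lane holds a partial sum, and the shuffle+add loop above keeps adding the upper active half onto the lower half until one partial sum remains. The helper name and the power-of-two element count are assumptions for illustration only.

#include <cstdint>
#include <vector>

// Halve the number of active partial sums each step, mirroring
// Mask[j] = MaskEnd + j followed by the vector ADD.
uint64_t sumByHalving(std::vector<uint64_t> Partial) {
  for (size_t Active = Partial.size(); Active > 1; Active /= 2)
    for (size_t j = 0; j < Active / 2; ++j)
      Partial[j] += Partial[j + Active / 2];
  return Partial[0];
}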
39744
39745// Attempt to peek through a target shuffle and extract the scalar from the
39746// source.
39747static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
39748 TargetLowering::DAGCombinerInfo &DCI,
39749 const X86Subtarget &Subtarget) {
39750 if (DCI.isBeforeLegalizeOps())
39751 return SDValue();
39752
39753 SDLoc dl(N);
39754 SDValue Src = N->getOperand(0);
39755 SDValue Idx = N->getOperand(1);
39756
39757 EVT VT = N->getValueType(0);
39758 EVT SrcVT = Src.getValueType();
39759 EVT SrcSVT = SrcVT.getVectorElementType();
39760 unsigned SrcEltBits = SrcSVT.getSizeInBits();
39761 unsigned NumSrcElts = SrcVT.getVectorNumElements();
39762
39763 // Don't attempt this for boolean mask vectors or unknown extraction indices.
39764 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
39765 return SDValue();
39766
39767 const APInt &IdxC = N->getConstantOperandAPInt(1);
39768 if (IdxC.uge(NumSrcElts))
39769 return SDValue();
39770
39771 SDValue SrcBC = peekThroughBitcasts(Src);
39772
39773 // Handle extract(bitcast(broadcast(scalar_value))).
39774 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
39775 SDValue SrcOp = SrcBC.getOperand(0);
39776 EVT SrcOpVT = SrcOp.getValueType();
39777 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
39778 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
39779 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
39780 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
39781 // TODO support non-zero offsets.
39782 if (Offset == 0) {
39783 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
39784 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
39785 return SrcOp;
39786 }
39787 }
39788 }
39789
39790 // If we're extracting a single element from a broadcast load and there are
39791 // no other users, just create a single load.
39792 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
39793 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
39794 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
39795 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
39796 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
39797 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
39798 MemIntr->getBasePtr(),
39799 MemIntr->getPointerInfo(),
39800 MemIntr->getOriginalAlign(),
39801 MemIntr->getMemOperand()->getFlags());
39802 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
39803 return Load;
39804 }
39805 }
39806
39807 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
39808 // TODO: Move to DAGCombine?
39809 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
39810 SrcBC.getValueType().isInteger() &&
39811 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
39812 SrcBC.getScalarValueSizeInBits() ==
39813 SrcBC.getOperand(0).getValueSizeInBits()) {
39814 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
39815 if (IdxC.ult(Scale)) {
39816 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
39817 SDValue Scl = SrcBC.getOperand(0);
39818 EVT SclVT = Scl.getValueType();
39819 if (Offset) {
39820 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
39821 DAG.getShiftAmountConstant(Offset, SclVT, dl));
39822 }
39823 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
39824 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
39825 return Scl;
39826 }
39827 }
39828
39829 // Handle extract(truncate(x)) for 0'th index.
39830 // TODO: Treat this as a faux shuffle?
39831 // TODO: When can we use this for general indices?
39832 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
39833 (SrcVT.getSizeInBits() % 128) == 0) {
39834 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
39835 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
39836 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
39837 Idx);
39838 }
39839
39840 // Resolve the target shuffle inputs and mask.
39841 SmallVector<int, 16> Mask;
39842 SmallVector<SDValue, 2> Ops;
39843 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
39844 return SDValue();
39845
39846 // Shuffle inputs must be the same size as the result.
39847 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
39848 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
39849 }))
39850 return SDValue();
39851
39852 // Attempt to narrow/widen the shuffle mask to the correct size.
39853 if (Mask.size() != NumSrcElts) {
39854 if ((NumSrcElts % Mask.size()) == 0) {
39855 SmallVector<int, 16> ScaledMask;
39856 int Scale = NumSrcElts / Mask.size();
39857 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
39858 Mask = std::move(ScaledMask);
39859 } else if ((Mask.size() % NumSrcElts) == 0) {
39860 // Simplify Mask based on demanded element.
39861 int ExtractIdx = (int)N->getConstantOperandVal(1);
39862 int Scale = Mask.size() / NumSrcElts;
39863 int Lo = Scale * ExtractIdx;
39864 int Hi = Scale * (ExtractIdx + 1);
39865 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
39866 if (i < Lo || Hi <= i)
39867 Mask[i] = SM_SentinelUndef;
39868
39869 SmallVector<int, 16> WidenedMask;
39870 while (Mask.size() > NumSrcElts &&
39871 canWidenShuffleElements(Mask, WidenedMask))
39872 Mask = std::move(WidenedMask);
39873 // TODO - investigate support for wider shuffle masks with known upper
39874 // undef/zero elements for implicit zero-extension.
39875 }
39876 }
39877
39878 // Check if narrowing/widening failed.
39879 if (Mask.size() != NumSrcElts)
39880 return SDValue();
39881
39882 int SrcIdx = Mask[IdxC.getZExtValue()];
39883
39884 // If the shuffle source element is undef/zero then we can just accept it.
39885 if (SrcIdx == SM_SentinelUndef)
39886 return DAG.getUNDEF(VT);
39887
39888 if (SrcIdx == SM_SentinelZero)
39889 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
39890 : DAG.getConstant(0, dl, VT);
39891
39892 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
39893 SrcIdx = SrcIdx % Mask.size();
39894
39895 // We can only extract other elements from 128-bit vectors and in certain
39896 // circumstances, depending on SSE-level.
39897 // TODO: Investigate using extract_subvector for larger vectors.
39898 // TODO: Investigate float/double extraction if it will be just stored.
39899 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
39900 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
39901 assert(SrcSVT == VT && "Unexpected extraction type");
39902 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
39903 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
39904 DAG.getIntPtrConstant(SrcIdx, dl));
39905 }
39906
39907 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
39908 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
39909 assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
39910 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
39911 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
39912 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
39913 DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
39914 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
39915 }
39916
39917 return SDValue();
39918}
39919
39920/// Extracting a scalar FP value from vector element 0 is free, so extract each
39921/// operand first, then perform the math as a scalar op.
39922static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
39923 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
39924 SDValue Vec = ExtElt->getOperand(0);
39925 SDValue Index = ExtElt->getOperand(1);
39926 EVT VT = ExtElt->getValueType(0);
39927 EVT VecVT = Vec.getValueType();
39928
39929 // TODO: If this is a unary/expensive/expand op, allow extraction from a
39930 // non-zero element because the shuffle+scalar op will be cheaper?
39931 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
39932 return SDValue();
39933
39934 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
39935 // extract, the condition code), so deal with those as a special-case.
39936 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
39937 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
39938 if (OpVT != MVT::f32 && OpVT != MVT::f64)
39939 return SDValue();
39940
39941 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
39942 SDLoc DL(ExtElt);
39943 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
39944 Vec.getOperand(0), Index);
39945 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
39946 Vec.getOperand(1), Index);
39947 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
39948 }
39949
39950 if (VT != MVT::f32 && VT != MVT::f64)
39951 return SDValue();
39952
39953 // Vector FP selects don't fit the pattern of FP math ops (because the
39954 // condition has a different type and we have to change the opcode), so deal
39955 // with those here.
39956 // FIXME: This is restricted to pre type legalization by ensuring the setcc
39957 // has i1 elements. If we loosen this we need to convert vector bool to a
39958 // scalar bool.
39959 if (Vec.getOpcode() == ISD::VSELECT &&
39960 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
39961 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
39962 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
39963 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
39964 SDLoc DL(ExtElt);
39965 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
39966 Vec.getOperand(0).getValueType().getScalarType(),
39967 Vec.getOperand(0), Index);
39968 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
39969 Vec.getOperand(1), Index);
39970 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
39971 Vec.getOperand(2), Index);
39972 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
39973 }
39974
39975 // TODO: This switch could include FNEG and the x86-specific FP logic ops
39976 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
39977 // missed load folding and fma+fneg combining.
39978 switch (Vec.getOpcode()) {
39979 case ISD::FMA: // Begin 3 operands
39980 case ISD::FMAD:
39981 case ISD::FADD: // Begin 2 operands
39982 case ISD::FSUB:
39983 case ISD::FMUL:
39984 case ISD::FDIV:
39985 case ISD::FREM:
39986 case ISD::FCOPYSIGN:
39987 case ISD::FMINNUM:
39988 case ISD::FMAXNUM:
39989 case ISD::FMINNUM_IEEE:
39990 case ISD::FMAXNUM_IEEE:
39991 case ISD::FMAXIMUM:
39992 case ISD::FMINIMUM:
39993 case X86ISD::FMAX:
39994 case X86ISD::FMIN:
39995 case ISD::FABS: // Begin 1 operand
39996 case ISD::FSQRT:
39997 case ISD::FRINT:
39998 case ISD::FCEIL:
39999 case ISD::FTRUNC:
40000 case ISD::FNEARBYINT:
40001 case ISD::FROUND:
40002 case ISD::FFLOOR:
40003 case X86ISD::FRCP:
40004 case X86ISD::FRSQRT: {
40005 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
40006 SDLoc DL(ExtElt);
40007 SmallVector<SDValue, 4> ExtOps;
40008 for (SDValue Op : Vec->ops())
40009 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
40010 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
40011 }
40012 default:
40013 return SDValue();
40014 }
40015 llvm_unreachable("All opcodes should return within switch");
40016}
40017
40018/// Try to convert a vector reduction sequence composed of binops and shuffles
40019/// into horizontal ops.
40020static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
40021 const X86Subtarget &Subtarget) {
40022 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
40023
40024 // We need at least SSE2 to do anything here.
40025 if (!Subtarget.hasSSE2())
40026 return SDValue();
40027
40028 ISD::NodeType Opc;
40029 SDValue Rdx =
40030 DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
40031 if (!Rdx)
40032 return SDValue();
40033
40034 SDValue Index = ExtElt->getOperand(1);
40035 assert(isNullConstant(Index) &&
40036 "Reduction doesn't end in an extract from index 0");
40037
40038 EVT VT = ExtElt->getValueType(0);
40039 EVT VecVT = Rdx.getValueType();
40040 if (VecVT.getScalarType() != VT)
40041 return SDValue();
40042
40043 SDLoc DL(ExtElt);
40044
40045 // vXi8 reduction - sub-128-bit vector.
40046 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
40047 if (VecVT == MVT::v4i8) {
40048 // Pad with zero.
40049 if (Subtarget.hasSSE41()) {
40050 Rdx = DAG.getBitcast(MVT::i32, Rdx);
40051 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
40052 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
40053 DAG.getIntPtrConstant(0, DL));
40054 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40055 } else {
40056 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
40057 DAG.getConstant(0, DL, VecVT));
40058 }
40059 }
40060 if (Rdx.getValueType() == MVT::v8i8) {
40061 // Pad with undef.
40062 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
40063 DAG.getUNDEF(MVT::v8i8));
40064 }
40065 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40066 DAG.getConstant(0, DL, MVT::v16i8));
40067 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40068 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40069 }
40070
40071 // Must be a >=128-bit vector with pow2 elements.
40072 if ((VecVT.getSizeInBits() % 128) != 0 ||
40073 !isPowerOf2_32(VecVT.getVectorNumElements()))
40074 return SDValue();
40075
40076 // vXi8 reduction - sum lo/hi halves then use PSADBW.
40077 if (VT == MVT::i8) {
40078 while (Rdx.getValueSizeInBits() > 128) {
40079 SDValue Lo, Hi;
40080 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
40081 VecVT = Lo.getValueType();
40082 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
40083 }
40084 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
40085
40086 SDValue Hi = DAG.getVectorShuffle(
40087 MVT::v16i8, DL, Rdx, Rdx,
40088 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
40089 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
40090 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
40091 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
40092 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
40093 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40094 }
40095
40096 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
40097 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
40098 return SDValue();
40099
40100 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
40101
40102 // 256-bit horizontal instructions operate on 128-bit chunks rather than
40103 // across the whole vector, so we need an extract + hop preliminary stage.
40104 // This is the only step where the operands of the hop are not the same value.
40105 // TODO: We could extend this to handle 512-bit or even longer vectors.
40106 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
40107 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
40108 unsigned NumElts = VecVT.getVectorNumElements();
40109 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
40110 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
40111 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
40112 VecVT = Rdx.getValueType();
40113 }
40114 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
40115 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
40116 return SDValue();
40117
40118 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
40119 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
40120 for (unsigned i = 0; i != ReductionSteps; ++i)
40121 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
40122
40123 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
40124}
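Background sketch (illustrative only): PSADBW against an all-zero vector works as the final step of a vXi8 add reduction because the absolute difference of a byte and zero is the byte itself, so each 8-byte group is summed into a single lane. A hypothetical scalar model of one group:

#include <cstdint>

// psadbw(x, 0) computes sum(|x[i] - 0|) = sum(x[i]) over each group of 8 bytes.
uint16_t psadbwGroupVsZero(const uint8_t Bytes[8]) {
  uint16_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += Bytes[i]; // |Bytes[i] - 0| == Bytes[i]; max total 8 * 255 = 2040.
  return Sum;
}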
40125
40126/// Detect vector gather/scatter index generation and convert it from being a
40127/// bunch of shuffles and extracts into a somewhat faster sequence.
40128/// For i686, the best sequence is apparently storing the value and loading
40129/// scalars back, while for x64 we should use 64-bit extracts and shifts.
40130static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
40131 TargetLowering::DAGCombinerInfo &DCI,
40132 const X86Subtarget &Subtarget) {
40133 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
40134 return NewOp;
40135
40136 SDValue InputVector = N->getOperand(0);
40137 SDValue EltIdx = N->getOperand(1);
40138 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
40139
40140 EVT SrcVT = InputVector.getValueType();
40141 EVT VT = N->getValueType(0);
40142 SDLoc dl(InputVector);
40143 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
40144 unsigned NumSrcElts = SrcVT.getVectorNumElements();
40145
40146 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
40147 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
40148
40149 // Integer Constant Folding.
40150 if (CIdx && VT.isInteger()) {
40151 APInt UndefVecElts;
40152 SmallVector<APInt, 16> EltBits;
40153 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
40154 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
40155 EltBits, true, false)) {
40156 uint64_t Idx = CIdx->getZExtValue();
40157 if (UndefVecElts[Idx])
40158 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
40159 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
40160 dl, VT);
40161 }
40162 }
40163
40164 if (IsPextr) {
40165 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40166 if (TLI.SimplifyDemandedBits(
40167 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
40168 return SDValue(N, 0);
40169
40170 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
40171 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
40172 InputVector.getOpcode() == X86ISD::PINSRW) &&
40173 InputVector.getOperand(2) == EltIdx) {
40174 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
40175 "Vector type mismatch");
40176 SDValue Scl = InputVector.getOperand(1);
40177 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
40178 return DAG.getZExtOrTrunc(Scl, dl, VT);
40179 }
40180
40181 // TODO - Remove this once we can handle the implicit zero-extension of
40182 // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
40183 // combineBasicSADPattern.
40184 return SDValue();
40185 }
40186
40187 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
40188 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
40189 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
40190 SDValue MMXSrc = InputVector.getOperand(0);
40191
40192 // The bitcast source is a direct mmx result.
40193 if (MMXSrc.getValueType() == MVT::x86mmx)
40194 return DAG.getBitcast(VT, InputVector);
40195 }
40196
40197 // Detect mmx to i32 conversion through a v2i32 elt extract.
40198 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
40199 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
40200 SDValue MMXSrc = InputVector.getOperand(0);
40201
40202 // The bitcast source is a direct mmx result.
40203 if (MMXSrc.getValueType() == MVT::x86mmx)
40204 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
40205 }
40206
40207 // Check whether this extract is the root of a sum of absolute differences
40208 // pattern. This has to be done here because we really want it to happen
40209 // pre-legalization.
40210 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
40211 return SAD;
40212
40213 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
40214 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
40215 return Cmp;
40216
40217 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
40218 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
40219 return MinMax;
40220
40221 if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
40222 return V;
40223
40224 if (SDValue V = scalarizeExtEltFP(N, DAG))
40225 return V;
40226
40227 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
40228 // and then testing the relevant element.
40229 //
40230 // Note that we only combine extracts on the *same* result number, i.e.
40231 // t0 = merge_values a0, a1, a2, a3
40232 // i1 = extract_vector_elt t0, Constant:i64<2>
40233 // i1 = extract_vector_elt t0, Constant:i64<3>
40234 // but not
40235 // i1 = extract_vector_elt t0:1, Constant:i64<2>
40236 // since the latter would need its own MOVMSK.
40237 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
40238 SmallVector<SDNode *, 16> BoolExtracts;
40239 unsigned ResNo = InputVector.getResNo();
40240 auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
40241 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40242 isa<ConstantSDNode>(Use->getOperand(1)) &&
40243 Use->getOperand(0).getResNo() == ResNo &&
40244 Use->getValueType(0) == MVT::i1) {
40245 BoolExtracts.push_back(Use);
40246 return true;
40247 }
40248 return false;
40249 };
40250 if (all_of(InputVector->uses(), IsBoolExtract) &&
40251 BoolExtracts.size() > 1) {
40252 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
40253 if (SDValue BC =
40254 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
40255 for (SDNode *Use : BoolExtracts) {
40256 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
40257 unsigned MaskIdx = Use->getConstantOperandVal(1);
40258 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
40259 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
40260 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
40261 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
40262 DCI.CombineTo(Use, Res);
40263 }
40264 return SDValue(N, 0);
40265 }
40266 }
40267 }
40268
40269 return SDValue();
40270}
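Illustrative sketch of the final combine above, under the assumption that the vXi1 vector has already been bitcast (via MOVMSK) to an integer Bits with one bit per element and at most 32 elements; the helper name is hypothetical.

#include <cstdint>

// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
bool extractBoolElt(uint32_t Bits, unsigned MaskIdx) {
  uint32_t Mask = 1u << MaskIdx; // single-bit mask, as with APInt::getOneBitSet
  return (Bits & Mask) == Mask;  // the SETEQ against the mask
}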
40271
40272/// If a vector select has an operand that is -1 or 0, try to simplify the
40273/// select to a bitwise logic operation.
40274/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
40275static SDValue
40276combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
40277 TargetLowering::DAGCombinerInfo &DCI,
40278 const X86Subtarget &Subtarget) {
40279 SDValue Cond = N->getOperand(0);
40280 SDValue LHS = N->getOperand(1);
40281 SDValue RHS = N->getOperand(2);
40282 EVT VT = LHS.getValueType();
40283 EVT CondVT = Cond.getValueType();
40284 SDLoc DL(N);
40285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40286
40287 if (N->getOpcode() != ISD::VSELECT)
40288 return SDValue();
40289
40290 assert(CondVT.isVector() && "Vector select expects a vector selector!");
40291
40292 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
40293 // TODO: Can we assert that both operands are not zeros (because that should
40294 // get simplified at node creation time)?
40295 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
40296 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
40297
40298 // If both inputs are 0/undef, create a complete zero vector.
40299 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
40300 if (TValIsAllZeros && FValIsAllZeros) {
40301 if (VT.isFloatingPoint())
40302 return DAG.getConstantFP(0.0, DL, VT);
40303 return DAG.getConstant(0, DL, VT);
40304 }
40305
40306 // To use the condition operand as a bitwise mask, it must have elements that
40307 // are the same size as the select elements. I.e., the condition operand must
40308 // have already been promoted from the IR select condition type <N x i1>.
40309 // Don't check if the types themselves are equal because that excludes
40310 // vector floating-point selects.
40311 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
40312 return SDValue();
40313
40314 // Try to invert the condition if true value is not all 1s and false value is
40315 // not all 0s. Only do this if the condition has one use.
40316 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
40317 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
40318 // Check if the selector will be produced by CMPP*/PCMP*.
40319 Cond.getOpcode() == ISD::SETCC &&
40320 // Check if SETCC has already been promoted.
40321 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
40322 CondVT) {
40323 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
40324
40325 if (TValIsAllZeros || FValIsAllOnes) {
40326 SDValue CC = Cond.getOperand(2);
40327 ISD::CondCode NewCC = ISD::getSetCCInverse(
40328 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
40329 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
40330 NewCC);
40331 std::swap(LHS, RHS);
40332 TValIsAllOnes = FValIsAllOnes;
40333 FValIsAllZeros = TValIsAllZeros;
40334 }
40335 }
40336
40337 // Cond value must be 'sign splat' to be converted to a logical op.
40338 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
40339 return SDValue();
40340
40341 // vselect Cond, 111..., 000... -> Cond
40342 if (TValIsAllOnes && FValIsAllZeros)
40343 return DAG.getBitcast(VT, Cond);
40344
40345 if (!TLI.isTypeLegal(CondVT))
40346 return SDValue();
40347
40348 // vselect Cond, 111..., X -> or Cond, X
40349 if (TValIsAllOnes) {
40350 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
40351 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
40352 return DAG.getBitcast(VT, Or);
40353 }
40354
40355 // vselect Cond, X, 000... -> and Cond, X
40356 if (FValIsAllZeros) {
40357 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
40358 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
40359 return DAG.getBitcast(VT, And);
40360 }
40361
40362 // vselect Cond, 000..., X -> andn Cond, X
40363 if (TValIsAllZeros) {
40364 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
40365 SDValue AndN;
40366 // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
40367 if (CondVT.getScalarType() == MVT::i1)
40368 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
40369 CastRHS);
40370 else
40371 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
40372 return DAG.getBitcast(VT, AndN);
40373 }
40374
40375 return SDValue();
40376}
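A per-lane scalar model of the transforms above (a sketch, not the lowering itself): when the condition lane is all-ones or all-zeros, the select degenerates to bitwise logic.

#include <cstdint>

// Cond is 0 or ~0 per lane, so:
//   T == ~0  ->  Cond | F     (vselect Cond, 111..., X -> or Cond, X)
//   F == 0   ->  Cond & T     (vselect Cond, X, 000... -> and Cond, X)
//   T == 0   ->  ~Cond & F    (vselect Cond, 000..., X -> andn Cond, X)
uint32_t vselectLane(uint32_t Cond, uint32_t T, uint32_t F) {
  return (Cond & T) | (~Cond & F);
}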
40377
40378/// If both arms of a vector select are concatenated vectors, split the select,
40379/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
40380/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
40381/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
40382static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
40383 const X86Subtarget &Subtarget) {
40384 unsigned Opcode = N->getOpcode();
40385 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
40386 return SDValue();
40387
40388 // TODO: Split 512-bit vectors too?
40389 EVT VT = N->getValueType(0);
40390 if (!VT.is256BitVector())
40391 return SDValue();
40392
40393 // TODO: Split as long as any 2 of the 3 operands are concatenated?
40394 SDValue Cond = N->getOperand(0);
40395 SDValue TVal = N->getOperand(1);
40396 SDValue FVal = N->getOperand(2);
40397 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
40398 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
40399 !collectConcatOps(TVal.getNode(), CatOpsT) ||
40400 !collectConcatOps(FVal.getNode(), CatOpsF))
40401 return SDValue();
40402
40403 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
40404 ArrayRef<SDValue> Ops) {
40405 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
40406 };
40407 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
40408 makeBlend, /*CheckBWI*/ false);
40409}
40410
40411static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
40412 SDValue Cond = N->getOperand(0);
40413 SDValue LHS = N->getOperand(1);
40414 SDValue RHS = N->getOperand(2);
40415 SDLoc DL(N);
40416
40417 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
40418 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
40419 if (!TrueC || !FalseC)
40420 return SDValue();
40421
40422 // Don't do this for crazy integer types.
40423 EVT VT = N->getValueType(0);
40424 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
40425 return SDValue();
40426
40427 // We're going to use the condition bit in math or logic ops. We could allow
40428 // this with a wider condition value (post-legalization it becomes an i8),
40429 // but if nothing is creating selects that late, it doesn't matter.
40430 if (Cond.getValueType() != MVT::i1)
40431 return SDValue();
40432
40433 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
40434 // 3, 5, or 9 with i32/i64, so those get transformed too.
40435 // TODO: For constants that overflow or do not differ by power-of-2 or small
40436 // multiplier, convert to 'and' + 'add'.
40437 const APInt &TrueVal = TrueC->getAPIntValue();
40438 const APInt &FalseVal = FalseC->getAPIntValue();
40439 bool OV;
40440 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
40441 if (OV)
40442 return SDValue();
40443
40444 APInt AbsDiff = Diff.abs();
40445 if (AbsDiff.isPowerOf2() ||
40446 ((VT == MVT::i32 || VT == MVT::i64) &&
40447 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
40448
40449 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
40450 // of the condition can usually be folded into a compare predicate, but even
40451 // without that, the sequence should be cheaper than a CMOV alternative.
40452 if (TrueVal.slt(FalseVal)) {
40453 Cond = DAG.getNOT(DL, Cond, MVT::i1);
40454 std::swap(TrueC, FalseC);
40455 }
40456
40457 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
40458 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
40459
40460 // Multiply condition by the difference if non-one.
40461 if (!AbsDiff.isOneValue())
40462 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
40463
40464 // Add the base if non-zero.
40465 if (!FalseC->isNullValue())
40466 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
40467
40468 return R;
40469 }
40470
40471 return SDValue();
40472}
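A scalar sketch of the select-of-two-constants transform above, assuming the TC - FC subtraction does not overflow (the ssub_ov check guards that in the real code); names are illustrative.

#include <cstdint>

// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC, for a 0/1 condition.
// Cond == 1 gives (TC - FC) + FC == TC; Cond == 0 gives FC.
int64_t selectTwoConstants(bool Cond, int64_t TC, int64_t FC) {
  return static_cast<int64_t>(Cond) * (TC - FC) + FC;
}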
40473
40474/// If this is a *dynamic* select (non-constant condition) and we can match
40475/// this node with one of the variable blend instructions, restructure the
40476/// condition so that blends can use the high (sign) bit of each element.
40477/// This function will also call SimplifyDemandedBits on already created
40478/// BLENDV to perform additional simplifications.
40479static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
40480 TargetLowering::DAGCombinerInfo &DCI,
40481 const X86Subtarget &Subtarget) {
40482 SDValue Cond = N->getOperand(0);
40483 if ((N->getOpcode() != ISD::VSELECT &&
40484 N->getOpcode() != X86ISD::BLENDV) ||
40485 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
40486 return SDValue();
40487
40488 // Don't optimize before the condition has been transformed to a legal type
40489 // and don't ever optimize vector selects that map to AVX512 mask-registers.
40490 unsigned BitWidth = Cond.getScalarValueSizeInBits();
40491 if (BitWidth < 8 || BitWidth > 64)
40492 return SDValue();
40493
40494 // We can only handle the cases where VSELECT is directly legal on the
40495 // subtarget. We custom lower VSELECT nodes with constant conditions and
40496 // this makes it hard to see whether a dynamic VSELECT will correctly
40497 // lower, so we both check the operation's status and explicitly handle the
40498 // cases where a *dynamic* blend will fail even though a constant-condition
40499 // blend could be custom lowered.
40500 // FIXME: We should find a better way to handle this class of problems.
40501 // Potentially, we should combine constant-condition vselect nodes
40502 // pre-legalization into shuffles and not mark as many types as custom
40503 // lowered.
40504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40505 EVT VT = N->getValueType(0);
40506 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
40507 return SDValue();
40508 // FIXME: We don't support i16-element blends currently. We could and
40509 // should support them by making *all* the bits in the condition be set
40510 // rather than just the high bit and using an i8-element blend.
40511 if (VT.getVectorElementType() == MVT::i16)
40512 return SDValue();
40513 // Dynamic blending was only available from SSE4.1 onward.
40514 if (VT.is128BitVector() && !Subtarget.hasSSE41())
40515 return SDValue();
40516 // Byte blends are only available in AVX2
40517 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
40518 return SDValue();
40519 // There are no 512-bit blend instructions that use sign bits.
40520 if (VT.is512BitVector())
40521 return SDValue();
40522
40523 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
40524 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
40525 UI != UE; ++UI)
40526 if ((UI->getOpcode() != ISD::VSELECT &&
40527 UI->getOpcode() != X86ISD::BLENDV) ||
40528 UI.getOperandNo() != 0)
40529 return false;
40530
40531 return true;
40532 };
40533
40534 APInt DemandedBits(APInt::getSignMask(BitWidth));
40535
40536 if (OnlyUsedAsSelectCond(Cond)) {
40537 KnownBits Known;
40538 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
40539 !DCI.isBeforeLegalizeOps());
40540 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
40541 return SDValue();
40542
40543 // If we changed the computation somewhere in the DAG, this change will
40544 // affect all users of Cond. Update all the nodes so that we do not use
40545 // the generic VSELECT anymore. Otherwise, we may perform wrong
40546 // optimizations as we messed with the actual expectation for the vector
40547 // boolean values.
40548 for (SDNode *U : Cond->uses()) {
40549 if (U->getOpcode() == X86ISD::BLENDV)
40550 continue;
40551
40552 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
40553 Cond, U->getOperand(1), U->getOperand(2));
40554 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
40555 DCI.AddToWorklist(U);
40556 }
40557 DCI.CommitTargetLoweringOpt(TLO);
40558 return SDValue(N, 0);
40559 }
40560
40561 // Otherwise we can still at least try to simplify multiple use bits.
40562 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
40563 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
40564 N->getOperand(1), N->getOperand(2));
40565
40566 return SDValue();
40567}
40568
40569// Try to match:
40570// (or (and (M, (sub 0, X)), (pandn M, X)))
40571// which is a special case of:
40572// (select M, (sub 0, X), X)
40573// Per:
40574// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
40575// We know that, if fNegate is 0 or 1:
40576// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
40577//
40578// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
40579// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
40580// ( M ? -X : X) == ((X ^ M ) + (M & 1))
40581// This lets us transform our vselect to:
40582// (add (xor X, M), (and M, 1))
40583// And further to:
40584// (sub (xor X, M), M)
40585static SDValue combineLogicBlendIntoConditionalNegate(
40586 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
40587 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
40588 EVT MaskVT = Mask.getValueType();
40589 assert(MaskVT.isInteger() &&
40590 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
40591 "Mask must be zero/all-bits");
40592
40593 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
40594 return SDValue();
40595 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
40596 return SDValue();
40597
40598 auto IsNegV = [](SDNode *N, SDValue V) {
40599 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
40600 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
40601 };
40602
40603 SDValue V;
40604 if (IsNegV(Y.getNode(), X))
40605 V = X;
40606 else if (IsNegV(X.getNode(), Y))
40607 V = Y;
40608 else
40609 return SDValue();
40610
40611 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
40612 SDValue SubOp2 = Mask;
40613
40614 // If the negate was on the false side of the select, then
40615 // the operands of the SUB need to be swapped. PR 27251.
40616 // This is because the pattern being matched above is
40617 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
40618 // but if the pattern matched was
40619 // (vselect M, X, (sub 0, X)), that is really negation of the pattern
40620 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
40621 // pattern also needs to be a negation of the replacement pattern above.
40622 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
40623 // sub accomplishes the negation of the replacement pattern.
40624 if (V == Y)
40625 std::swap(SubOp1, SubOp2);
40626
40627 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
40628 return DAG.getBitcast(VT, Res);
40629}
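The conditional-negate identity used above, checked as a scalar sketch; M is restricted to 0 or all-ones, matching the sign-splat mask requirement.

#include <cstdint>

// (M ? -X : X) == (X ^ M) - M   when M is 0 or -1 (all bits set):
//   M ==  0: (X ^ 0) - 0   == X
//   M == -1: (~X) - (-1)   == ~X + 1 == -X
int32_t condNegate(int32_t X, int32_t M) {
  return (X ^ M) - M; // mirrors the (sub (xor X, M), M) replacement
}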
40630
40631/// Do target-specific dag combines on SELECT and VSELECT nodes.
40632static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
40633 TargetLowering::DAGCombinerInfo &DCI,
40634 const X86Subtarget &Subtarget) {
40635 SDLoc DL(N);
40636 SDValue Cond = N->getOperand(0);
40637 SDValue LHS = N->getOperand(1);
40638 SDValue RHS = N->getOperand(2);
40639
40640 // Try simplification again because we use this function to optimize
40641 // BLENDV nodes that are not handled by the generic combiner.
40642 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
40643 return V;
40644
40645 EVT VT = LHS.getValueType();
40646 EVT CondVT = Cond.getValueType();
40647 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40648 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
40649
40650 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
40651 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
40652 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
40653 if (CondVT.isVector() && CondVT.isInteger() &&
40654 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
40655 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
40656 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
40657 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
40658 DL, DAG, Subtarget))
40659 return V;
40660
40661 // Convert vselects with constant condition into shuffles.
40662 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
40663 SmallVector<int, 64> Mask;
40664 if (createShuffleMaskFromVSELECT(Mask, Cond))
40665 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
40666 }
40667
40668 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
40669 // by forcing the unselected elements to zero.
40670 // TODO: Can we handle more shuffles with this?
40671 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
40672 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
40673 LHS.hasOneUse() && RHS.hasOneUse()) {
40674 MVT SimpleVT = VT.getSimpleVT();
40675 bool LHSUnary, RHSUnary;
40676 SmallVector<SDValue, 1> LHSOps, RHSOps;
40677 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
40678 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
40679 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
40680 LHSUnary) &&
40681 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
40682 RHSUnary)) {
40683 int NumElts = VT.getVectorNumElements();
40684 for (int i = 0; i != NumElts; ++i) {
40685 if (CondMask[i] < NumElts)
40686 RHSMask[i] = 0x80;
40687 else
40688 LHSMask[i] = 0x80;
40689 }
40690 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
40691 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
40692 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
40693 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
40694 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
40695 }
40696 }
40697
40698 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
40699 // instructions match the semantics of the common C idiom x<y?x:y but not
40700 // x<=y?x:y, because of how they handle negative zero (which can be
40701 // ignored in unsafe-math mode).
40702 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
40703 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
40704 VT != MVT::f80 && VT != MVT::f128 &&
40705 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
40706 (Subtarget.hasSSE2() ||
40707 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
40708 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
40709
40710 unsigned Opcode = 0;
40711 // Check for x CC y ? x : y.
40712 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
40713 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
40714 switch (CC) {
40715 default: break;
40716 case ISD::SETULT:
40717 // Converting this to a min would handle NaNs incorrectly, and swapping
40718 // the operands would cause it to handle comparisons between positive
40719 // and negative zero incorrectly.
40720 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
40721 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40722 !(DAG.isKnownNeverZeroFloat(LHS) ||
40723 DAG.isKnownNeverZeroFloat(RHS)))
40724 break;
40725 std::swap(LHS, RHS);
40726 }
40727 Opcode = X86ISD::FMIN;
40728 break;
40729 case ISD::SETOLE:
40730 // Converting this to a min would handle comparisons between positive
40731 // and negative zero incorrectly.
40732 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40733 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
40734 break;
40735 Opcode = X86ISD::FMIN;
40736 break;
40737 case ISD::SETULE:
40738 // Converting this to a min would handle both negative zeros and NaNs
40739 // incorrectly, but we can swap the operands to fix both.
40740 std::swap(LHS, RHS);
40741 LLVM_FALLTHROUGH;
40742 case ISD::SETOLT:
40743 case ISD::SETLT:
40744 case ISD::SETLE:
40745 Opcode = X86ISD::FMIN;
40746 break;
40747
40748 case ISD::SETOGE:
40749 // Converting this to a max would handle comparisons between positive
40750 // and negative zero incorrectly.
40751 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40752 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
40753 break;
40754 Opcode = X86ISD::FMAX;
40755 break;
40756 case ISD::SETUGT:
40757 // Converting this to a max would handle NaNs incorrectly, and swapping
40758 // the operands would cause it to handle comparisons between positive
40759 // and negative zero incorrectly.
40760 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
40761 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40762 !(DAG.isKnownNeverZeroFloat(LHS) ||
40763 DAG.isKnownNeverZeroFloat(RHS)))
40764 break;
40765 std::swap(LHS, RHS);
40766 }
40767 Opcode = X86ISD::FMAX;
40768 break;
40769 case ISD::SETUGE:
40770 // Converting this to a max would handle both negative zeros and NaNs
40771 // incorrectly, but we can swap the operands to fix both.
40772 std::swap(LHS, RHS);
40773 LLVM_FALLTHROUGH;
40774 case ISD::SETOGT:
40775 case ISD::SETGT:
40776 case ISD::SETGE:
40777 Opcode = X86ISD::FMAX;
40778 break;
40779 }
40780 // Check for x CC y ? y : x -- a min/max with reversed arms.
40781 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
40782 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
40783 switch (CC) {
40784 default: break;
40785 case ISD::SETOGE:
40786 // Converting this to a min would handle comparisons between positive
40787 // and negative zero incorrectly, and swapping the operands would
40788 // cause it to handle NaNs incorrectly.
40789 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40790 !(DAG.isKnownNeverZeroFloat(LHS) ||
40791 DAG.isKnownNeverZeroFloat(RHS))) {
40792 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
40793 break;
40794 std::swap(LHS, RHS);
40795 }
40796 Opcode = X86ISD::FMIN;
40797 break;
40798 case ISD::SETUGT:
40799 // Converting this to a min would handle NaNs incorrectly.
40800 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
40801 break;
40802 Opcode = X86ISD::FMIN;
40803 break;
40804 case ISD::SETUGE:
40805 // Converting this to a min would handle both negative zeros and NaNs
40806 // incorrectly, but we can swap the operands to fix both.
40807 std::swap(LHS, RHS);
40808 LLVM_FALLTHROUGH;
40809 case ISD::SETOGT:
40810 case ISD::SETGT:
40811 case ISD::SETGE:
40812 Opcode = X86ISD::FMIN;
40813 break;
40814
40815 case ISD::SETULT:
40816 // Converting this to a max would handle NaNs incorrectly.
40817 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
40818 break;
40819 Opcode = X86ISD::FMAX;
40820 break;
40821 case ISD::SETOLE:
40822 // Converting this to a max would handle comparisons between positive
40823 // and negative zero incorrectly, and swapping the operands would
40824 // cause it to handle NaNs incorrectly.
40825 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
40826 !DAG.isKnownNeverZeroFloat(LHS) &&
40827 !DAG.isKnownNeverZeroFloat(RHS)) {
40828 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
40829 break;
40830 std::swap(LHS, RHS);
40831 }
40832 Opcode = X86ISD::FMAX;
40833 break;
40834 case ISD::SETULE:
40835 // Converting this to a max would handle both negative zeros and NaNs
40836 // incorrectly, but we can swap the operands to fix both.
40837 std::swap(LHS, RHS);
40838 LLVM_FALLTHROUGH;
40839 case ISD::SETOLT:
40840 case ISD::SETLT:
40841 case ISD::SETLE:
40842 Opcode = X86ISD::FMAX;
40843 break;
40844 }
40845 }
40846
40847 if (Opcode)
40848 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
40849 }
40850
40851 // Some mask scalar intrinsics rely on checking if only one bit is set
40852 // and implement it in C code like this:
40853 // A[0] = (U & 1) ? A[0] : W[0];
40854 // This creates some redundant instructions that break pattern matching.
40855 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
40856 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
40857 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
40858 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
40859 SDValue AndNode = Cond.getOperand(0);
40860 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
40861 isNullConstant(Cond.getOperand(1)) &&
40862 isOneConstant(AndNode.getOperand(1))) {
40863 // LHS and RHS swapped due to
40864 // setcc outputting 1 when AND resulted in 0 and vice versa.
40865 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
40866 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
40867 }
40868 }
40869
40870 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
40871 // lowering on KNL. In this case we convert it to
40872 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
40873 // The same applies to all vectors of i8 and i16 without BWI.
40874 // Make sure we extend these even before type legalization gets a chance to
40875 // split wide vectors.
40876 // Since SKX these selects have a proper lowering.
40877 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
40878 CondVT.getVectorElementType() == MVT::i1 &&
40879 (VT.getVectorElementType() == MVT::i8 ||
40880 VT.getVectorElementType() == MVT::i16)) {
40881 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
40882 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
40883 }
40884
40885 // AVX512 - Extend select with zero to merge with target shuffle.
40886 // select(mask, extract_subvector(shuffle(x)), zero) -->
40887 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
40888 // TODO - support non target shuffles as well.
40889 if (Subtarget.hasAVX512() && CondVT.isVector() &&
40890 CondVT.getVectorElementType() == MVT::i1) {
40891 auto SelectableOp = [&TLI](SDValue Op) {
40892 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40893 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
40894 isNullConstant(Op.getOperand(1)) &&
40895 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
40896 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
40897 };
40898
40899 bool SelectableLHS = SelectableOp(LHS);
40900 bool SelectableRHS = SelectableOp(RHS);
40901 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
40902 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
40903
40904 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
40905 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
40906 : RHS.getOperand(0).getValueType();
40907 unsigned NumSrcElts = SrcVT.getVectorNumElements();
40908 EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
40909 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
40910 VT.getSizeInBits());
40911 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
40912 VT.getSizeInBits());
40913 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
40914 DAG.getUNDEF(SrcCondVT), Cond,
40915 DAG.getIntPtrConstant(0, DL));
40916 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
40917 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
40918 }
40919 }
40920
40921 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
40922 return V;
40923
40924 // Canonicalize min/max:
40925 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
40926 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
40927 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
40928 // the need for an extra compare against zero. e.g.
40929 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
40930 // subl %esi, %edi
40931 // testl %edi, %edi
40932 // movl $0, %eax
40933 // cmovgl %edi, %eax
40934 // =>
40935 // xorl %eax, %eax
40936 // subl %esi, %edi
40937 // cmovsl %eax, %edi
40938 //
40939 // We can also canonicalize
40940 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
40941 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
40942 // This allows the use of a test instruction for the compare.
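// Illustrative lowering for the unsigned case (a sketch, not taken from the
// source; register choices are hypothetical):
//   (x u> 1) ? x : 1  becomes  (x != 0) ? x : 1, i.e.
//     movl $1, %eax
//     testl %edi, %edi
//     cmovnel %edi, %eax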
40943 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
40944 Cond.hasOneUse() &&
40945 LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
40946 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
40947 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
40948 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
40949 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
40950 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
40951 Cond.getOperand(0), Cond.getOperand(1), NewCC);
40952 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
40953 }
40954 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
40955 ISD::CondCode NewCC = ISD::SETUGE;
40956 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
40957 Cond.getOperand(0), Cond.getOperand(1), NewCC);
40958 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
40959 }
40960 }
40961
40962 // Check if the first operand is all zeros and Cond type is vXi1.
40963 // If this is an avx512 target we can improve the use of zero masking by
40964 // swapping the operands and inverting the condition.
40965 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
40966 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
40967 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
40968 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
40969 // Invert the cond to not(cond) : xor(op,allones)=not(op)
40970 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
40971 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
40972 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
40973 }
40974
40975 // Early exit check
40976 if (!TLI.isTypeLegal(VT))
40977 return SDValue();
40978
40979 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
40980 return V;
40981
40982 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
40983 return V;
40984
40985 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
40986 return V;
40987
40988 // select(~Cond, X, Y) -> select(Cond, Y, X)
40989 if (CondVT.getScalarType() != MVT::i1) {
40990 if (SDValue CondNot = IsNOT(Cond, DAG))
40991 return DAG.getNode(N->getOpcode(), DL, VT,
40992 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
40993 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
40994 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
40995 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
40996 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
40997 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
40998 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
40999 }
41000 }
41001
41002 // Try to optimize vXi1 selects if both operands are either all constants or
41003 // bitcasts from scalar integer type. In that case we can convert the operands
41004 // to integer and use an integer select which will be converted to a CMOV.
41005 // We need to take a little bit of care to avoid creating an i64 type after
41006 // type legalization.
41007 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
41008 VT.getVectorElementType() == MVT::i1 &&
41009 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
41010 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
41011 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
41012 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
41013
41014 if ((LHSIsConst ||
41015 (LHS.getOpcode() == ISD::BITCAST &&
41016 LHS.getOperand(0).getValueType() == IntVT)) &&
41017 (RHSIsConst ||
41018 (RHS.getOpcode() == ISD::BITCAST &&
41019 RHS.getOperand(0).getValueType() == IntVT))) {
41020 if (LHSIsConst)
41021 LHS = combinevXi1ConstantToInteger(LHS, DAG);
41022 else
41023 LHS = LHS.getOperand(0);
41024
41025 if (RHSIsConst)
41026 RHS = combinevXi1ConstantToInteger(RHS, DAG);
41027 else
41028 RHS = RHS.getOperand(0);
41029
41030 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
41031 return DAG.getBitcast(VT, Select);
41032 }
41033 }
41034
41035 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
41036 // single bits, then invert the predicate and swap the select operands.
41037 // This can lower using a vector shift bit-hack rather than mask and compare.
41038 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
41039 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
41040 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
41041 Cond.getOperand(0).getOpcode() == ISD::AND &&
41042 isNullOrNullSplat(Cond.getOperand(1)) &&
41043 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
41044 Cond.getOperand(0).getValueType() == VT) {
41045 // The 'and' mask must be composed of power-of-2 constants.
41046 SDValue And = Cond.getOperand(0);
41047 auto *C = isConstOrConstSplat(And.getOperand(1));
41048 if (C && C->getAPIntValue().isPowerOf2()) {
41049 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
41050 SDValue NotCond =
41051 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
41052 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
41053 }
41054
41055 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
41056 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
41057 // 16-bit lacks a proper blendv.
41058 unsigned EltBitWidth = VT.getScalarSizeInBits();
41059 bool CanShiftBlend =
41060 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
41061 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
41062 (Subtarget.hasXOP()));
41063 if (CanShiftBlend &&
41064 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
41065 return C->getAPIntValue().isPowerOf2();
41066 })) {
41067 // Create a left-shift constant to get the mask bits over to the sign-bit.
41068 SDValue Mask = And.getOperand(1);
41069 SmallVector<int, 32> ShlVals;
41070 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
41071 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
41072 ShlVals.push_back(EltBitWidth - 1 -
41073 MaskVal->getAPIntValue().exactLogBase2());
41074 }
41075 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
41076 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
41077 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
41078 SDValue NewCond =
41079 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
41080 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
41081 }
41082 }
41083
41084 return SDValue();
41085}
41086
41087/// Combine:
41088/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
41089/// to:
41090/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
41091/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
41092/// Note that this is only legal for some op/cc combinations.
41093static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
41094 SelectionDAG &DAG,
41095 const X86Subtarget &Subtarget) {
41096 // This combine only operates on CMP-like nodes.
41097 if (!(Cmp.getOpcode() == X86ISD::CMP ||
41098 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
41099 return SDValue();
41100
41101 // Can't replace the cmp if it has more uses than the one we're looking at.
41102 // FIXME: We would like to be able to handle this, but would need to make sure
41103 // all uses were updated.
41104 if (!Cmp.hasOneUse())
41105 return SDValue();
41106
41107 // This only applies to variations of the common case:
41108 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
41109 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
41110 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
41111 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
41112 // Using the proper condcodes (see below), overflow is checked for.
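// Illustrative example (a sketch, not taken from the source): for C code like
//   if (atomic_fetch_add(&x, 1) < 0) ...
// the compare of the old value against 0 can reuse the flags of the locked
// add itself, e.g.
//   lock addl $1, (%rdi)
//   jle .LBB_target        ; COND_LE on the add's flags == "old value was < 0"
// where the register and label names are hypothetical.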
41113
41114 // FIXME: We can generalize both constraints:
41115 // - XOR/OR/AND (if they were made to survive AtomicExpand)
41116 // - LHS != 1
41117 // if the result is compared.
41118
41119 SDValue CmpLHS = Cmp.getOperand(0);
41120 SDValue CmpRHS = Cmp.getOperand(1);
41121
41122 if (!CmpLHS.hasOneUse())
41123 return SDValue();
41124
41125 unsigned Opc = CmpLHS.getOpcode();
41126 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
41127 return SDValue();
41128
41129 SDValue OpRHS = CmpLHS.getOperand(2);
41130 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
41131 if (!OpRHSC)
41132 return SDValue();
41133
41134 APInt Addend = OpRHSC->getAPIntValue();
41135 if (Opc == ISD::ATOMIC_LOAD_SUB)
41136 Addend = -Addend;
41137
41138 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
41139 if (!CmpRHSC)
41140 return SDValue();
41141
41142 APInt Comparison = CmpRHSC->getAPIntValue();
41143
41144 // If the addend is the negation of the comparison value, then we can do
41145 // a full comparison by emitting the atomic arithmetic as a locked sub.
41146 if (Comparison == -Addend) {
41147 // The CC is fine, but we need to rewrite the LHS of the comparison as an
41148 // atomic sub.
41149 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
41150 auto AtomicSub = DAG.getAtomic(
41151 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
41152 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
41153 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
41154 AN->getMemOperand());
41155 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
41156 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
41157 DAG.getUNDEF(CmpLHS.getValueType()));
41158 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
41159 return LockOp;
41160 }
41161
41162 // We can handle comparisons with zero in a number of cases by manipulating
41163 // the CC used.
41164 if (!Comparison.isNullValue())
41165 return SDValue();
41166
41167 if (CC == X86::COND_S && Addend == 1)
41168 CC = X86::COND_LE;
41169 else if (CC == X86::COND_NS && Addend == 1)
41170 CC = X86::COND_G;
41171 else if (CC == X86::COND_G && Addend == -1)
41172 CC = X86::COND_GE;
41173 else if (CC == X86::COND_LE && Addend == -1)
41174 CC = X86::COND_L;
41175 else
41176 return SDValue();
41177
41178 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
41179 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
41180 DAG.getUNDEF(CmpLHS.getValueType()));
41181 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
41182 return LockOp;
41183}
41184
41185// Check whether a boolean test is testing a boolean value generated by
41186// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
41187// code.
41188//
41189// Simplify the following patterns:
41190// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
41191// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
41192// to (Op EFLAGS Cond)
41193//
41194// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
41195// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
41196// to (Op EFLAGS !Cond)
41197//
41198// where Op could be BRCOND or CMOV.
41199//
41200static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
41201 // This combine only operates on CMP-like nodes.
41202 if (!(Cmp.getOpcode() == X86ISD::CMP ||
41203 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
41204 return SDValue();
41205
41206 // Quit if not used as a boolean value.
41207 if (CC != X86::COND_E && CC != X86::COND_NE)
41208 return SDValue();
41209
41210 // Check CMP operands. One of them should be 0 or 1 and the other should be
41211 // an SetCC or extended from it.
41212 SDValue Op1 = Cmp.getOperand(0);
41213 SDValue Op2 = Cmp.getOperand(1);
41214
41215 SDValue SetCC;
41216 const ConstantSDNode* C = nullptr;
41217 bool needOppositeCond = (CC == X86::COND_E);
41218 bool checkAgainstTrue = false; // Is it a comparison against 1?
41219
41220 if ((C = dyn_cast<ConstantSDNode>(Op1)))
41221 SetCC = Op2;
41222 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
41223 SetCC = Op1;
41224 else // Quit if all operands are not constants.
41225 return SDValue();
41226
41227 if (C->getZExtValue() == 1) {
41228 needOppositeCond = !needOppositeCond;
41229 checkAgainstTrue = true;
41230 } else if (C->getZExtValue() != 0)
41231 // Quit if the constant is neither 0 or 1.
41232 return SDValue();
41233
41234 bool truncatedToBoolWithAnd = false;
41235 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
41236 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
41237 SetCC.getOpcode() == ISD::TRUNCATE ||
41238 SetCC.getOpcode() == ISD::AND) {
41239 if (SetCC.getOpcode() == ISD::AND) {
41240 int OpIdx = -1;
41241 if (isOneConstant(SetCC.getOperand(0)))
41242 OpIdx = 1;
41243 if (isOneConstant(SetCC.getOperand(1)))
41244 OpIdx = 0;
41245 if (OpIdx < 0)
41246 break;
41247 SetCC = SetCC.getOperand(OpIdx);
41248 truncatedToBoolWithAnd = true;
41249 } else
41250 SetCC = SetCC.getOperand(0);
41251 }
41252
41253 switch (SetCC.getOpcode()) {
41254 case X86ISD::SETCC_CARRY:
41255 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
41256 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
41257 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
41258 // truncated to i1 using 'and'.
41259 if (checkAgainstTrue && !truncatedToBoolWithAnd)
41260 break;
41261 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
41262 "Invalid use of SETCC_CARRY!");
41263 LLVM_FALLTHROUGH;
41264 case X86ISD::SETCC:
41265 // Set the condition code or opposite one if necessary.
41266 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
41267 if (needOppositeCond)
41268 CC = X86::GetOppositeBranchCondition(CC);
41269 return SetCC.getOperand(1);
41270 case X86ISD::CMOV: {
41271 // Check whether false/true value has canonical one, i.e. 0 or 1.
41272 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
41273 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
41274 // Quit if true value is not a constant.
41275 if (!TVal)
41276 return SDValue();
41277 // Quit if false value is not a constant.
41278 if (!FVal) {
41279 SDValue Op = SetCC.getOperand(0);
41280 // Skip 'zext' or 'trunc' node.
41281 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
41282 Op.getOpcode() == ISD::TRUNCATE)
41283 Op = Op.getOperand(0);
41284 // A special case for rdrand/rdseed, where 0 is set if false cond is
41285 // found.
41286 if ((Op.getOpcode() != X86ISD::RDRAND &&
41287 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
41288 return SDValue();
41289 }
41290 // Quit if false value is not the constant 0 or 1.
41291 bool FValIsFalse = true;
41292 if (FVal && FVal->getZExtValue() != 0) {
41293 if (FVal->getZExtValue() != 1)
41294 return SDValue();
41295 // If FVal is 1, opposite cond is needed.
41296 needOppositeCond = !needOppositeCond;
41297 FValIsFalse = false;
41298 }
41299 // Quit if TVal is not the constant opposite of FVal.
41300 if (FValIsFalse && TVal->getZExtValue() != 1)
41301 return SDValue();
41302 if (!FValIsFalse && TVal->getZExtValue() != 0)
41303 return SDValue();
41304 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
41305 if (needOppositeCond)
41306 CC = X86::GetOppositeBranchCondition(CC);
41307 return SetCC.getOperand(3);
41308 }
41309 }
41310
41311 return SDValue();
41312}
41313
41314/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
41315/// Match:
41316/// (X86or (X86setcc) (X86setcc))
41317/// (X86cmp (and (X86setcc) (X86setcc)), 0)
41318static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
41319 X86::CondCode &CC1, SDValue &Flags,
41320 bool &isAnd) {
41321 if (Cond->getOpcode() == X86ISD::CMP) {
41322 if (!isNullConstant(Cond->getOperand(1)))
41323 return false;
41324
41325 Cond = Cond->getOperand(0);
41326 }
41327
41328 isAnd = false;
41329
41330 SDValue SetCC0, SetCC1;
41331 switch (Cond->getOpcode()) {
41332 default: return false;
41333 case ISD::AND:
41334 case X86ISD::AND:
41335 isAnd = true;
41336 LLVM_FALLTHROUGH;
41337 case ISD::OR:
41338 case X86ISD::OR:
41339 SetCC0 = Cond->getOperand(0);
41340 SetCC1 = Cond->getOperand(1);
41341 break;
41342 };
41343
41344 // Make sure we have SETCC nodes, using the same flags value.
41345 if (SetCC0.getOpcode() != X86ISD::SETCC ||
41346 SetCC1.getOpcode() != X86ISD::SETCC ||
41347 SetCC0->getOperand(1) != SetCC1->getOperand(1))
41348 return false;
41349
41350 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
41351 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
41352 Flags = SetCC0->getOperand(1);
41353 return true;
41354}
41355
41356 // When legalizing carry, we create carries via add X, -1.
41357// If that comes from an actual carry, via setcc, we use the
41358// carry directly.
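// Illustrative example (a sketch, not from the source): for
//   (X86add (setcc COND_B, Flags), -1)
// adding -1 to a 0/1 carry value reproduces that carry in CF, so a user that
// only needs COND_B can read the original Flags instead.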
41359static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
41360 if (EFLAGS.getOpcode() == X86ISD::ADD) {
41361 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
41362 SDValue Carry = EFLAGS.getOperand(0);
41363 while (Carry.getOpcode() == ISD::TRUNCATE ||
41364 Carry.getOpcode() == ISD::ZERO_EXTEND ||
41365 Carry.getOpcode() == ISD::SIGN_EXTEND ||
41366 Carry.getOpcode() == ISD::ANY_EXTEND ||
41367 (Carry.getOpcode() == ISD::AND &&
41368 isOneConstant(Carry.getOperand(1))))
41369 Carry = Carry.getOperand(0);
41370 if (Carry.getOpcode() == X86ISD::SETCC ||
41371 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
41372 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
41373 uint64_t CarryCC = Carry.getConstantOperandVal(0);
41374 SDValue CarryOp1 = Carry.getOperand(1);
41375 if (CarryCC == X86::COND_B)
41376 return CarryOp1;
41377 if (CarryCC == X86::COND_A) {
41378 // Try to convert COND_A into COND_B in an attempt to facilitate
41379 // materializing "setb reg".
41380 //
41381 // Do not flip "e > c", where "c" is a constant, because Cmp
41382 // instruction cannot take an immediate as its first operand.
41383 //
41384 if (CarryOp1.getOpcode() == X86ISD::SUB &&
41385 CarryOp1.getNode()->hasOneUse() &&
41386 CarryOp1.getValueType().isInteger() &&
41387 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
41388 SDValue SubCommute =
41389 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
41390 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
41391 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
41392 }
41393 }
41394 // If this is a check of the z flag of an add with 1, switch to the
41395 // C flag.
41396 if (CarryCC == X86::COND_E &&
41397 CarryOp1.getOpcode() == X86ISD::ADD &&
41398 isOneConstant(CarryOp1.getOperand(1)))
41399 return CarryOp1;
41400 }
41401 }
41402 }
41403
41404 return SDValue();
41405}
41406
41407 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
41408/// to avoid the inversion.
41409static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
41410 SelectionDAG &DAG,
41411 const X86Subtarget &Subtarget) {
41412 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
41413 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
41414 EFLAGS.getOpcode() != X86ISD::TESTP)
41415 return SDValue();
41416
41417 // PTEST/TESTP sets EFLAGS as:
41418 // TESTZ: ZF = (Op0 & Op1) == 0
41419 // TESTC: CF = (~Op0 & Op1) == 0
41420 // TESTNZC: ZF == 0 && CF == 0
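// Illustrative derivation for the TEST*(~X,Y) rewrite below (not from the
// source): if Op0 == ~X then TESTC(Op0, Op1) sets
//   CF = (~~X & Op1) == 0 = (X & Op1) == 0,
// which is exactly the ZF computed by TESTZ(X, Op1) - hence the CF<->ZF
// condition-code swap.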
41421 EVT VT = EFLAGS.getValueType();
41422 SDValue Op0 = EFLAGS.getOperand(0);
41423 SDValue Op1 = EFLAGS.getOperand(1);
41424 EVT OpVT = Op0.getValueType();
41425
41426 // TEST*(~X,Y) == TEST*(X,Y)
41427 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
41428 X86::CondCode InvCC;
41429 switch (CC) {
41430 case X86::COND_B:
41431 // testc -> testz.
41432 InvCC = X86::COND_E;
41433 break;
41434 case X86::COND_AE:
41435 // !testc -> !testz.
41436 InvCC = X86::COND_NE;
41437 break;
41438 case X86::COND_E:
41439 // testz -> testc.
41440 InvCC = X86::COND_B;
41441 break;
41442 case X86::COND_NE:
41443 // !testz -> !testc.
41444 InvCC = X86::COND_AE;
41445 break;
41446 case X86::COND_A:
41447 case X86::COND_BE:
41448 // testnzc -> testnzc (no change).
41449 InvCC = CC;
41450 break;
41451 default:
41452 InvCC = X86::COND_INVALID;
41453 break;
41454 }
41455
41456 if (InvCC != X86::COND_INVALID) {
41457 CC = InvCC;
41458 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
41459 DAG.getBitcast(OpVT, NotOp0), Op1);
41460 }
41461 }
41462
41463 if (CC == X86::COND_E || CC == X86::COND_NE) {
41464 // TESTZ(X,~Y) == TESTC(Y,X)
41465 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
41466 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
41467 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
41468 DAG.getBitcast(OpVT, NotOp1), Op0);
41469 }
41470
41471 if (Op0 == Op1) {
41472 SDValue BC = peekThroughBitcasts(Op0);
41473 EVT BCVT = BC.getValueType();
41474 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41475 "Unexpected vector type");
41476
41477 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
41478 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
41479 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
41480 DAG.getBitcast(OpVT, BC.getOperand(0)),
41481 DAG.getBitcast(OpVT, BC.getOperand(1)));
41482 }
41483
41484 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
41485 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
41486 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
41487 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
41488 DAG.getBitcast(OpVT, BC.getOperand(0)),
41489 DAG.getBitcast(OpVT, BC.getOperand(1)));
41490 }
41491
41492 // If every element is an all-sign value, see if we can use MOVMSK to
41493 // more efficiently extract the sign bits and compare that.
41494 // TODO: Handle TESTC with comparison inversion.
41495 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
41496 // MOVMSK combines to make sure it's never worse than PTEST?
41497 unsigned EltBits = BCVT.getScalarSizeInBits();
41498 if (DAG.ComputeNumSignBits(BC) == EltBits) {
41499 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
41500 APInt SignMask = APInt::getSignMask(EltBits);
41501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41502 if (SDValue Res =
41503 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
41504 // For vXi16 cases we need to use pmovmskb and extract every other
41505 // sign bit.
41506 SDLoc DL(EFLAGS);
41507 if (EltBits == 16) {
41508 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
41509 Res = DAG.getBitcast(MovmskVT, Res);
41510 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
41511 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
41512 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
41513 } else {
41514 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
41515 }
41516 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
41517 DAG.getConstant(0, DL, MVT::i32));
41518 }
41519 }
41520 }
41521
41522 // TESTZ(-1,X) == TESTZ(X,X)
41523 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
41524 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
41525
41526 // TESTZ(X,-1) == TESTZ(X,X)
41527 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
41528 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
41529 }
41530
41531 return SDValue();
41532}
41533
41534// Attempt to simplify the MOVMSK input based on the comparison type.
41535static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
41536 SelectionDAG &DAG,
41537 const X86Subtarget &Subtarget) {
41538 // Handle eq/ne against zero (any_of).
41539 // Handle eq/ne against -1 (all_of).
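// Illustrative patterns (a sketch, not from the source), for a v4i32 mask M:
//   any_of: (movmsk M) != 0    - at least one element has its sign bit set.
//   all_of: (movmsk M) == 0xF  - all four elements have their sign bit set.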
41540 if (!(CC == X86::COND_E || CC == X86::COND_NE))
41541 return SDValue();
41542 if (EFLAGS.getValueType() != MVT::i32)
41543 return SDValue();
41544 unsigned CmpOpcode = EFLAGS.getOpcode();
41545 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
41546 return SDValue();
41547 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
41548 if (!CmpConstant)
41549 return SDValue();
41550 const APInt &CmpVal = CmpConstant->getAPIntValue();
41551
41552 SDValue CmpOp = EFLAGS.getOperand(0);
41553 unsigned CmpBits = CmpOp.getValueSizeInBits();
41554 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
41555
41556 // Peek through any truncate.
41557 if (CmpOp.getOpcode() == ISD::TRUNCATE)
41558 CmpOp = CmpOp.getOperand(0);
41559
41560 // Bail if we don't find a MOVMSK.
41561 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
41562 return SDValue();
41563
41564 SDValue Vec = CmpOp.getOperand(0);
41565 MVT VecVT = Vec.getSimpleValueType();
41566 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
41567 "Unexpected MOVMSK operand");
41568 unsigned NumElts = VecVT.getVectorNumElements();
41569 unsigned NumEltBits = VecVT.getScalarSizeInBits();
41570
41571 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
41572 bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
41573 CmpVal.isMask(NumElts);
41574 if (!IsAnyOf && !IsAllOf)
41575 return SDValue();
41576
41577 // See if we can peek through to a vector with a wider element type, if the
41578 // signbits extend down to all the sub-elements as well.
41579 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
41580 // potential SimplifyDemandedBits/Elts cases.
41581 if (Vec.getOpcode() == ISD::BITCAST) {
41582 SDValue BC = peekThroughBitcasts(Vec);
41583 MVT BCVT = BC.getSimpleValueType();
41584 unsigned BCNumElts = BCVT.getVectorNumElements();
41585 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
41586 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
41587 BCNumEltBits > NumEltBits &&
41588 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
41589 SDLoc DL(EFLAGS);
41590 unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
41591 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
41592 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
41593 DAG.getConstant(CmpMask, DL, MVT::i32));
41594 }
41595 }
41596
41597 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
41598 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
41599 if (IsAllOf && Subtarget.hasSSE41()) {
41600 SDValue BC = peekThroughBitcasts(Vec);
41601 if (BC.getOpcode() == X86ISD::PCMPEQ &&
41602 ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
41603 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
41604 SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
41605 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
41606 }
41607 }
41608
41609 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
41610 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
41611 // sign bits prior to the comparison with zero unless we know that
41612 // the vXi16 splats the sign bit down to the lower i8 half.
41613 // TODO: Handle all_of patterns.
41614 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
41615 SDValue VecOp0 = Vec.getOperand(0);
41616 SDValue VecOp1 = Vec.getOperand(1);
41617 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
41618 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
41619 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
41620 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
41621 SDLoc DL(EFLAGS);
41622 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
41623 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
41624 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
41625 if (!SignExt0) {
41626 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
41627 DAG.getConstant(0xAAAA, DL, MVT::i16));
41628 }
41629 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
41630 DAG.getConstant(0, DL, MVT::i16));
41631 }
41632 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
41633 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
41634 if (CmpBits == 16 && Subtarget.hasInt256() &&
41635 VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41636 VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41637 VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
41638 VecOp0.getConstantOperandAPInt(1) == 0 &&
41639 VecOp1.getConstantOperandAPInt(1) == 8 &&
41640 (IsAnyOf || (SignExt0 && SignExt1))) {
41641 SDLoc DL(EFLAGS);
41642 SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
41643 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
41644 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
41645 if (!SignExt0 || !SignExt1) {
41646 assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
41647 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
41648 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
41649 }
41650 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
41651 DAG.getConstant(CmpMask, DL, MVT::i32));
41652 }
41653 }
41654
41655 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
41656 SmallVector<int, 32> ShuffleMask;
41657 SmallVector<SDValue, 2> ShuffleInputs;
41658 if (NumElts == CmpBits &&
41659 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
41660 ShuffleMask, DAG) &&
41661 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
41662 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
41663 unsigned NumShuffleElts = ShuffleMask.size();
41664 APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
41665 for (int M : ShuffleMask) {
41666 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
41667 DemandedElts.setBit(M);
41668 }
41669 if (DemandedElts.isAllOnesValue()) {
41670 SDLoc DL(EFLAGS);
41671 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
41672 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
41673 Result =
41674 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
41675 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
41676 EFLAGS.getOperand(1));
41677 }
41678 }
41679
41680 return SDValue();
41681}
41682
41683/// Optimize an EFLAGS definition used according to the condition code \p CC
41684/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
41685/// uses of chain values.
41686static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
41687 SelectionDAG &DAG,
41688 const X86Subtarget &Subtarget) {
41689 if (CC == X86::COND_B)
41690 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
41691 return Flags;
41692
41693 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
41694 return R;
41695
41696 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
41697 return R;
41698
41699 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
41700 return R;
41701
41702 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
41703}
41704
41705/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
41706static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
41707 TargetLowering::DAGCombinerInfo &DCI,
41708 const X86Subtarget &Subtarget) {
41709 SDLoc DL(N);
41710
41711 SDValue FalseOp = N->getOperand(0);
41712 SDValue TrueOp = N->getOperand(1);
41713 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
41714 SDValue Cond = N->getOperand(3);
41715
41716 // cmov X, X, ?, ? --> X
41717 if (TrueOp == FalseOp)
41718 return TrueOp;
41719
41720 // Try to simplify the EFLAGS and condition code operands.
41721 // We can't always do this as FCMOV only supports a subset of X86 cond.
41722 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
41723 if (!(FalseOp.getValueType() == MVT::f80 ||
41724 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
41725 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
41726 !Subtarget.hasCMov() || hasFPCMov(CC)) {
41727 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
41728 Flags};
41729 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
41730 }
41731 }
41732
41733 // If this is a select between two integer constants, try to do some
41734 // optimizations. Note that the operands are ordered the opposite of SELECT
41735 // operands.
41736 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
41737 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
41738 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
41739 // larger than FalseC (the false value).
41740 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
41741 CC = X86::GetOppositeBranchCondition(CC);
41742 std::swap(TrueC, FalseC);
41743 std::swap(TrueOp, FalseOp);
41744 }
41745
41746 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
41747 // This is efficient for any integer data type (including i8/i16) and
41748 // shift amount.
41749 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
41750 Cond = getSETCC(CC, Cond, DL, DAG);
41751
41752 // Zero extend the condition if needed.
41753 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
41754
41755 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
41756 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
41757 DAG.getConstant(ShAmt, DL, MVT::i8));
41758 return Cond;
41759 }
41760
41761 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
41762 // for any integer data type, including i8/i16.
41763 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
41764 Cond = getSETCC(CC, Cond, DL, DAG);
41765
41766 // Zero extend the condition if needed.
41767 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
41768 FalseC->getValueType(0), Cond);
41769 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
41770 SDValue(FalseC, 0));
41771 return Cond;
41772 }
41773
41774 // Optimize cases that will turn into an LEA instruction. This requires
41775 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
41776 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
41777 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
41778 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
41779 "Implicit constant truncation");
41780
41781 bool isFastMultiplier = false;
41782 if (Diff.ult(10)) {
41783 switch (Diff.getZExtValue()) {
41784 default: break;
41785 case 1: // result = add base, cond
41786 case 2: // result = lea base( , cond*2)
41787 case 3: // result = lea base(cond, cond*2)
41788 case 4: // result = lea base( , cond*4)
41789 case 5: // result = lea base(cond, cond*4)
41790 case 8: // result = lea base( , cond*8)
41791 case 9: // result = lea base(cond, cond*8)
41792 isFastMultiplier = true;
41793 break;
41794 }
41795 }
41796
41797 if (isFastMultiplier) {
41798 Cond = getSETCC(CC, Cond, DL ,DAG);
41799 // Zero extend the condition if needed.
41800 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
41801 Cond);
41802 // Scale the condition by the difference.
41803 if (Diff != 1)
41804 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
41805 DAG.getConstant(Diff, DL, Cond.getValueType()));
41806
41807 // Add the base if non-zero.
41808 if (FalseC->getAPIntValue() != 0)
41809 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
41810 SDValue(FalseC, 0));
41811 return Cond;
41812 }
41813 }
41814 }
41815 }
41816
41817 // Handle these cases:
41818 // (select (x != c), e, c) -> select (x != c), e, x),
41819 // (select (x == c), c, e) -> select (x == c), x, e)
41820 // where the c is an integer constant, and the "select" is the combination
41821 // of CMOV and CMP.
41822 //
41823 // The rationale for this change is that a conditional-move from a constant
41824 // needs two instructions, whereas a conditional-move from a register needs
41825 // only one instruction.
41826 //
41827 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
41828 // some instruction-combining opportunities. This opt needs to be
41829 // postponed as late as possible.
41830 //
41831 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
41832 // the DCI.xxxx conditions are provided to postpone the optimization as
41833 // late as possible.
41834
41835 ConstantSDNode *CmpAgainst = nullptr;
41836 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
41837 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
41838 !isa<ConstantSDNode>(Cond.getOperand(0))) {
41839
41840 if (CC == X86::COND_NE &&
41841 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
41842 CC = X86::GetOppositeBranchCondition(CC);
41843 std::swap(TrueOp, FalseOp);
41844 }
41845
41846 if (CC == X86::COND_E &&
41847 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
41848 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
41849 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
41850 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
41851 }
41852 }
41853 }
41854
41855 // Fold and/or of setcc's to double CMOV:
41856 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
41857 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
41858 //
41859 // This combine lets us generate:
41860 // cmovcc1 (jcc1 if we don't have CMOV)
41861 // cmovcc2 (same)
41862 // instead of:
41863 // setcc1
41864 // setcc2
41865 // and/or
41866 // cmovne (jne if we don't have CMOV)
41867 // When we can't use the CMOV instruction, it might increase branch
41868 // mispredicts.
41869 // When we can use CMOV, or when there is no mispredict, this improves
41870 // throughput and reduces register pressure.
41871 //
41872 if (CC == X86::COND_NE) {
41873 SDValue Flags;
41874 X86::CondCode CC0, CC1;
41875 bool isAndSetCC;
41876 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
41877 if (isAndSetCC) {
41878 std::swap(FalseOp, TrueOp);
41879 CC0 = X86::GetOppositeBranchCondition(CC0);
41880 CC1 = X86::GetOppositeBranchCondition(CC1);
41881 }
41882
41883 SDValue LOps[] = {FalseOp, TrueOp,
41884 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
41885 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
41886 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
41887 Flags};
41888 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
41889 return CMOV;
41890 }
41891 }
41892
41893 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
41894 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
41895 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
41896 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
41897 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
41898 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
41899 SDValue Add = TrueOp;
41900 SDValue Const = FalseOp;
41901 // Canonicalize the condition code for easier matching and output.
41902 if (CC == X86::COND_E)
41903 std::swap(Add, Const);
41904
41905 // We might have replaced the constant in the cmov with the LHS of the
41906 // compare. If so change it to the RHS of the compare.
41907 if (Const == Cond.getOperand(0))
41908 Const = Cond.getOperand(1);
41909
41910 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
41911 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
41912 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
41913 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
41914 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
41915 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
41916 EVT VT = N->getValueType(0);
41917 // This should constant fold.
41918 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
41919 SDValue CMov =
41920 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
41921 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
41922 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
41923 }
41924 }
41925
41926 return SDValue();
41927}
41928
41929/// Different mul shrinking modes.
41930enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
41931
41932static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
41933 EVT VT = N->getOperand(0).getValueType();
41934 if (VT.getScalarSizeInBits() != 32)
41935 return false;
41936
41937 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
41938 unsigned SignBits[2] = {1, 1};
41939 bool IsPositive[2] = {false, false};
41940 for (unsigned i = 0; i < 2; i++) {
41941 SDValue Opd = N->getOperand(i);
41942
41943 SignBits[i] = DAG.ComputeNumSignBits(Opd);
41944 IsPositive[i] = DAG.SignBitIsZero(Opd);
41945 }
41946
41947 bool AllPositive = IsPositive[0] && IsPositive[1];
41948 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
41949 // When ranges are from -128 ~ 127, use MULS8 mode.
41950 if (MinSignBits >= 25)
41951 Mode = ShrinkMode::MULS8;
41952 // When ranges are from 0 ~ 255, use MULU8 mode.
41953 else if (AllPositive && MinSignBits >= 24)
41954 Mode = ShrinkMode::MULU8;
41955 // When ranges are from -32768 ~ 32767, use MULS16 mode.
41956 else if (MinSignBits >= 17)
41957 Mode = ShrinkMode::MULS16;
41958 // When ranges are from 0 ~ 65535, use MULU16 mode.
41959 else if (AllPositive && MinSignBits >= 16)
41960 Mode = ShrinkMode::MULU16;
41961 else
41962 return false;
41963 return true;
41964}
41965
41966/// When the operands of vector mul are extended from smaller size values,
41967 /// like i8 and i16, the type of mul may be shrunk to generate more
41968/// efficient code. Two typical patterns are handled:
41969/// Pattern1:
41970/// %2 = sext/zext <N x i8> %1 to <N x i32>
41971/// %4 = sext/zext <N x i8> %3 to <N x i32>
41972 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
41973/// %5 = mul <N x i32> %2, %4
41974///
41975/// Pattern2:
41976/// %2 = zext/sext <N x i16> %1 to <N x i32>
41977/// %4 = zext/sext <N x i16> %3 to <N x i32>
41978/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
41979/// %5 = mul <N x i32> %2, %4
41980///
41981/// There are four mul shrinking modes:
41982/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
41983 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
41984/// generate pmullw+sext32 for it (MULS8 mode).
41985/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
41986/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
41987/// generate pmullw+zext32 for it (MULU8 mode).
41988/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
41989/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
41990/// generate pmullw+pmulhw for it (MULS16 mode).
41991/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
41992/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
41993/// generate pmullw+pmulhuw for it (MULU16 mode).
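/// Illustrative MULU16 lowering (a sketch, not from the source) for
///   %5 = mul <4 x i32> %2, %4 with both operands in [0, 65535]:
///   %lo = pmullw(trunc16(%2), trunc16(%4))   ; low 16 bits of each product
///   %hi = pmulhuw(trunc16(%2), trunc16(%4))  ; high 16 bits of each product
///   %5  = concat(punpcklwd(%lo, %hi), punpckhwd(%lo, %hi)) viewed as <4 x i32>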
41994static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
41995 const X86Subtarget &Subtarget) {
41996 // Check for legality
41997 // pmullw/pmulhw are not supported by SSE.
41998 if (!Subtarget.hasSSE2())
41999 return SDValue();
42000
42001 // Check for profitability
42002 // pmulld is supported since SSE41. It is better to use pmulld
42003 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
42004 // the expansion.
42005 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
42006 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
42007 return SDValue();
42008
42009 ShrinkMode Mode;
42010 if (!canReduceVMulWidth(N, DAG, Mode))
42011 return SDValue();
42012
42013 SDLoc DL(N);
42014 SDValue N0 = N->getOperand(0);
42015 SDValue N1 = N->getOperand(1);
42016 EVT VT = N->getOperand(0).getValueType();
42017 unsigned NumElts = VT.getVectorNumElements();
42018 if ((NumElts % 2) != 0)
42019 return SDValue();
42020
42021 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
42022
42023 // Shrink the operands of mul.
42024 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
42025 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
42026
42027 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
42028 // lower part is needed.
42029 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
42030 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
42031 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
42032 : ISD::SIGN_EXTEND,
42033 DL, VT, MulLo);
42034
42035 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
42036 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
42037 // the higher part is also needed.
42038 SDValue MulHi =
42039 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
42040 ReducedVT, NewN0, NewN1);
42041
42042 // Repack the lower part and higher part result of mul into a wider
42043 // result.
42044 // Generate shuffle functioning as punpcklwd.
42045 SmallVector<int, 16> ShuffleMask(NumElts);
42046 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42047 ShuffleMask[2 * i] = i;
42048 ShuffleMask[2 * i + 1] = i + NumElts;
42049 }
42050 SDValue ResLo =
42051 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42052 ResLo = DAG.getBitcast(ResVT, ResLo);
42053 // Generate shuffle functioning as punpckhwd.
42054 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
42055 ShuffleMask[2 * i] = i + NumElts / 2;
42056 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
42057 }
42058 SDValue ResHi =
42059 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
42060 ResHi = DAG.getBitcast(ResVT, ResHi);
42061 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
42062}
42063
42064static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
42065 EVT VT, const SDLoc &DL) {
42066
42067 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
42068 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42069 DAG.getConstant(Mult, DL, VT));
42070 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
42071 DAG.getConstant(Shift, DL, MVT::i8));
42072 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42073 N->getOperand(0));
42074 return Result;
42075 };
42076
42077 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
42078 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42079 DAG.getConstant(Mul1, DL, VT));
42080 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
42081 DAG.getConstant(Mul2, DL, VT));
42082 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
42083 N->getOperand(0));
42084 return Result;
42085 };
42086
42087 switch (MulAmt) {
42088 default:
42089 break;
42090 case 11:
42091 // mul x, 11 => add ((shl (mul x, 5), 1), x)
42092 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
42093 case 21:
42094 // mul x, 21 => add ((shl (mul x, 5), 2), x)
42095 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
42096 case 41:
42097 // mul x, 41 => add ((shl (mul x, 5), 3), x)
42098 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
42099 case 22:
42100 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
42101 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42102 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
42103 case 19:
42104 // mul x, 19 => add ((shl (mul x, 9), 1), x)
42105 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
42106 case 37:
42107 // mul x, 37 => add ((shl (mul x, 9), 2), x)
42108 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
42109 case 73:
42110 // mul x, 73 => add ((shl (mul x, 9), 3), x)
42111 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
42112 case 13:
42113 // mul x, 13 => add ((shl (mul x, 3), 2), x)
42114 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
42115 case 23:
42116 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
42117 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
42118 case 26:
42119 // mul x, 26 => add ((mul (mul x, 5), 5), x)
42120 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
42121 case 28:
42122 // mul x, 28 => add ((mul (mul x, 9), 3), x)
42123 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
42124 case 29:
42125 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
42126 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
42127 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
42128 }
42129
42130 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
42131 // by a single LEA.
42132 // First check if this is a sum of two powers of 2 because that's easy. Then
42133 // count how many trailing zeros precede the lowest set bit.
42134 // TODO: We can do this even without LEA at a cost of two shifts and an add.
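// e.g. (illustrative, not from the source) MulAmt == 20 == 16 + 4:
//   Shift1 = X << 4, Shift2 = X << 2, result = Shift1 + Shift2,
//   and the shl-by-2 plus add later folds into a single scaled LEA.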
42135 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
42136 unsigned ScaleShift = countTrailingZeros(MulAmt);
42137 if (ScaleShift >= 1 && ScaleShift < 4) {
42138 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
42139 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42140 DAG.getConstant(ShiftAmt, DL, MVT::i8));
42141 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42142 DAG.getConstant(ScaleShift, DL, MVT::i8));
42143 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
42144 }
42145 }
42146
42147 return SDValue();
42148}
42149
42150// If the upper 17 bits of each element are zero then we can use PMADDWD,
42151// which is always at least as quick as PMULLD, except on KNL.
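// (Illustrative reasoning, not from the source: with the top 17 bits known
// zero, every i32 element is a non-negative 15-bit value, so the signed
// i16 x i16 products are exact and the odd i16 lanes contribute 0 to each
// pairwise add, leaving just the desired low products in the i32 lanes.)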
42152static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
42153 const X86Subtarget &Subtarget) {
42154 if (!Subtarget.hasSSE2())
42155 return SDValue();
42156
42157 if (Subtarget.isPMADDWDSlow())
42158 return SDValue();
42159
42160 EVT VT = N->getValueType(0);
42161
42162 // Only support vXi32 vectors.
42163 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
42164 return SDValue();
42165
42166 // Make sure the type is legal or will be widened to a legal type.
42167 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
42168 return SDValue();
42169
42170 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
42171
42172 // Without BWI, we would need to split v32i16.
42173 if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
42174 return SDValue();
42175
42176 SDValue N0 = N->getOperand(0);
42177 SDValue N1 = N->getOperand(1);
42178
42179 // If we are zero extending two steps without SSE4.1, it's better to reduce
42180 // the vmul width instead.
42181 if (!Subtarget.hasSSE41() &&
42182 (N0.getOpcode() == ISD::ZERO_EXTEND &&
42183 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
42184 (N1.getOpcode() == ISD::ZERO_EXTEND &&
42185 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
42186 return SDValue();
42187
42188 APInt Mask17 = APInt::getHighBitsSet(32, 17);
42189 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
42190 !DAG.MaskedValueIsZero(N0, Mask17))
42191 return SDValue();
42192
42193 // Use SplitOpsAndApply to handle AVX splitting.
42194 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42195 ArrayRef<SDValue> Ops) {
42196 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
42197 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
42198 };
42199 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
42200 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
42201 PMADDWDBuilder);
42202}
42203
42204static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
42205 const X86Subtarget &Subtarget) {
42206 if (!Subtarget.hasSSE2())
42207 return SDValue();
42208
42209 EVT VT = N->getValueType(0);
42210
42211 // Only support vXi64 vectors.
42212 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
42213 VT.getVectorNumElements() < 2 ||
42214 !isPowerOf2_32(VT.getVectorNumElements()))
42215 return SDValue();
42216
42217 SDValue N0 = N->getOperand(0);
42218 SDValue N1 = N->getOperand(1);
42219
42220 // MULDQ returns the 64-bit result of the signed multiplication of the lower
42221 // 32-bits. We can lower with this if the sign bits stretch that far.
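// (Illustrative, not from the source: ComputeNumSignBits(N0) > 32 means every
// element of N0 is a sign-extended i32 value, so the signed 32 x 32 -> 64
// multiply performed by PMULDQ equals the full 64-bit multiply.)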
42222 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
42223 DAG.ComputeNumSignBits(N1) > 32) {
42224 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42225 ArrayRef<SDValue> Ops) {
42226 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
42227 };
42228 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
42229 PMULDQBuilder, /*CheckBWI*/false);
42230 }
42231
42232 // If the upper bits are zero we can use a single pmuludq.
42233 APInt Mask = APInt::getHighBitsSet(64, 32);
42234 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
42235 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
42236 ArrayRef<SDValue> Ops) {
42237 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
42238 };
42239 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
42240 PMULUDQBuilder, /*CheckBWI*/false);
42241 }
42242
42243 return SDValue();
42244}
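// Illustrative sketch: PMULDQ sign-extends the low 32 bits of each i64 lane
// and returns the full 64-bit signed product, so when both operands have more
// than 32 sign bits every lane already equals the sign-extension of its low
// half and the PMULDQ result matches the full i64 multiply (e.g. -5 * 7: the
// low halves 0xFFFFFFFB and 0x7 extend back to -5 and 7, giving -35).
// PMULUDQ zero-extends instead, which is exact once the upper 32 bits of both
// operands are known zero.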
42245
42246/// Optimize a single multiply with constant into two operations in order to
42247/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
42248static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
42249 TargetLowering::DAGCombinerInfo &DCI,
42250 const X86Subtarget &Subtarget) {
42251 EVT VT = N->getValueType(0);
42252
42253 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
42254 return V;
42255
42256 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
42257 return V;
42258
42259 if (DCI.isBeforeLegalize() && VT.isVector())
42260 return reduceVMULWidth(N, DAG, Subtarget);
42261
42262 if (!MulConstantOptimization)
42263 return SDValue();
42264 // An imul is usually smaller than the alternative sequence.
42265 if (DAG.getMachineFunction().getFunction().hasMinSize())
42266 return SDValue();
42267
42268 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
42269 return SDValue();
42270
42271 if (VT != MVT::i64 && VT != MVT::i32)
42272 return SDValue();
42273
42274 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
42275 if (!C)
42276 return SDValue();
42277 if (isPowerOf2_64(C->getZExtValue()))
42278 return SDValue();
42279
42280 int64_t SignMulAmt = C->getSExtValue();
42281 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
42282 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
42283
42284 SDLoc DL(N);
42285 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
42286 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42287 DAG.getConstant(AbsMulAmt, DL, VT));
42288 if (SignMulAmt < 0)
42289 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
42290 NewMul);
42291
42292 return NewMul;
42293 }
42294
42295 uint64_t MulAmt1 = 0;
42296 uint64_t MulAmt2 = 0;
42297 if ((AbsMulAmt % 9) == 0) {
42298 MulAmt1 = 9;
42299 MulAmt2 = AbsMulAmt / 9;
42300 } else if ((AbsMulAmt % 5) == 0) {
42301 MulAmt1 = 5;
42302 MulAmt2 = AbsMulAmt / 5;
42303 } else if ((AbsMulAmt % 3) == 0) {
42304 MulAmt1 = 3;
42305 MulAmt2 = AbsMulAmt / 3;
42306 }
42307
42308 SDValue NewMul;
42309 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
42310 if (MulAmt2 &&
42311 (isPowerOf2_64(MulAmt2) ||
42312 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
42313
42314 if (isPowerOf2_64(MulAmt2) &&
42315 !(SignMulAmt >= 0 && N->hasOneUse() &&
42316 N->use_begin()->getOpcode() == ISD::ADD))
42317 // If the second multiplier is pow2, issue it first. We want the multiply by
42318 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
42319 // is an add. Only do this for positive multiply amounts since the
42320 // negate would prevent it from being used as an address mode anyway.
42321 std::swap(MulAmt1, MulAmt2);
42322
42323 if (isPowerOf2_64(MulAmt1))
42324 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42325 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
42326 else
42327 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
42328 DAG.getConstant(MulAmt1, DL, VT));
42329
42330 if (isPowerOf2_64(MulAmt2))
42331 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
42332 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
42333 else
42334 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
42335 DAG.getConstant(MulAmt2, DL, VT));
42336
42337 // Negate the result.
42338 if (SignMulAmt < 0)
42339 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
42340 NewMul);
42341 } else if (!Subtarget.slowLEA())
42342 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
42343
42344 if (!NewMul) {
42345 assert(C->getZExtValue() != 0 &&
42346 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
42347 "Both cases that could cause potential overflows should have "
42348 "already been handled.");
42349 if (isPowerOf2_64(AbsMulAmt - 1)) {
42350 // (mul x, 2^N + 1) => (add (shl x, N), x)
42351 NewMul = DAG.getNode(
42352 ISD::ADD, DL, VT, N->getOperand(0),
42353 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42354 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
42355 MVT::i8)));
42356 // To negate, subtract the number from zero
42357 if (SignMulAmt < 0)
42358 NewMul = DAG.getNode(ISD::SUB, DL, VT,
42359 DAG.getConstant(0, DL, VT), NewMul);
42360 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
42361 // (mul x, 2^N - 1) => (sub (shl x, N), x)
42362 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42363 DAG.getConstant(Log2_64(AbsMulAmt + 1),
42364 DL, MVT::i8));
42365 // To negate, reverse the operands of the subtract.
42366 if (SignMulAmt < 0)
42367 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
42368 else
42369 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
42370 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
42371 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
42372 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42373 DAG.getConstant(Log2_64(AbsMulAmt - 2),
42374 DL, MVT::i8));
42375 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
42376 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
42377 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
42378 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
42379 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
42380 DAG.getConstant(Log2_64(AbsMulAmt + 2),
42381 DL, MVT::i8));
42382 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
42383 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
42384 }
42385 }
42386
42387 return NewMul;
42388}
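// Illustrative walk-throughs of the decompositions above (editor sketch,
// assuming combineMulSpecial declines the amounts that reach the tail):
//   mul x, 45 -> 45 == 9*5, both LEA-friendly: MUL_IMM 9 then MUL_IMM 5.
//   mul x, 40 -> 40 == 5*8; the pow2 factor is issued first (unless the lone
//                use is an add), giving (x << 3) then MUL_IMM 5.
//   mul x, 17 -> 2^4 + 1, handled by the AbsMulAmt - 1 branch as (x << 4) + x.
//   mul x, -3 -> MUL_IMM 3 followed by SUB 0, result (handled earlier above).
static_assert(9u * 5u == 45u && ((7u << 3) * 5u) == 7u * 40u &&
              ((7u << 4) + 7u) == 7u * 17u,
              "scalar identities behind the examples");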
42389
42390// Try to form a MULHU or MULHS node by looking for
42391// (srl (mul ext, ext), 16)
42392// TODO: This is X86 specific because we want to be able to handle wide types
42393// before type legalization. But we can only do it if the vector will be
42394// legalized via widening/splitting. Type legalization can't handle promotion
42395// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
42396// combiner.
42397static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
42398 const X86Subtarget &Subtarget) {
42399 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
42400 "SRL or SRA node is required here!");
42401 SDLoc DL(N);
42402
42403 // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
42404 // the multiply.
42405 if (!Subtarget.hasSSE41())
42406 return SDValue();
42407
42408 // The operation feeding into the shift must be a multiply.
42409 SDValue ShiftOperand = N->getOperand(0);
42410 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
42411 return SDValue();
42412
42413 // Input type should be at least vXi32.
42414 EVT VT = N->getValueType(0);
42415 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
42416 return SDValue();
42417
42418 // Need a shift by 16.
42419 APInt ShiftAmt;
42420 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
42421 ShiftAmt != 16)
42422 return SDValue();
42423
42424 SDValue LHS = ShiftOperand.getOperand(0);
42425 SDValue RHS = ShiftOperand.getOperand(1);
42426
42427 unsigned ExtOpc = LHS.getOpcode();
42428 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
42429 RHS.getOpcode() != ExtOpc)
42430 return SDValue();
42431
42432 // Peek through the extends.
42433 LHS = LHS.getOperand(0);
42434 RHS = RHS.getOperand(0);
42435
42436 // Ensure the input types match.
42437 EVT MulVT = LHS.getValueType();
42438 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
42439 return SDValue();
42440
42441 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
42442 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
42443
42444 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
42445 return DAG.getNode(ExtOpc, DL, VT, Mulh);
42446}
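// Illustrative sketch: for (srl (mul (zext vXi16 X), (zext vXi16 Y)), 16) the
// widened product of two 16-bit values is at most 0xFFFF * 0xFFFF ==
// 0xFFFE0001, so shifting right by 16 leaves exactly the high 16 bits of the
// 16x16 product, i.e. zext(MULHU X, Y). E.g. X == 0xFFFF, Y == 2: product
// 0x1FFFE, >> 16 gives 1 == MULHU(0xFFFF, 2). The SRA/sign_extend form maps
// to MULHS in the same way.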
42447
42448static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
42449 SDValue N0 = N->getOperand(0);
42450 SDValue N1 = N->getOperand(1);
42451 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
42452 EVT VT = N0.getValueType();
42453
42454 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
42455 // since the result of setcc_c is all zero's or all ones.
42456 if (VT.isInteger() && !VT.isVector() &&
42457 N1C && N0.getOpcode() == ISD::AND &&
42458 N0.getOperand(1).getOpcode() == ISD::Constant) {
42459 SDValue N00 = N0.getOperand(0);
42460 APInt Mask = N0.getConstantOperandAPInt(1);
42461 Mask <<= N1C->getAPIntValue();
42462 bool MaskOK = false;
42463 // We can handle cases concerning bit-widening nodes containing setcc_c if
42464 // we carefully interrogate the mask to make sure we are semantics
42465 // preserving.
42466 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
42467 // of the underlying setcc_c operation if the setcc_c was zero extended.
42468 // Consider the following example:
42469 // zext(setcc_c) -> i32 0x0000FFFF
42470 // c1 -> i32 0x0000FFFF
42471 // c2 -> i32 0x00000001
42472 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
42473 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
42474 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
42475 MaskOK = true;
42476 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
42477 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
42478 MaskOK = true;
42479 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
42480 N00.getOpcode() == ISD::ANY_EXTEND) &&
42481 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
42482 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
42483 }
42484 if (MaskOK && Mask != 0) {
42485 SDLoc DL(N);
42486 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
42487 }
42488 }
42489
42490 // Hardware support for vector shifts is sparse which makes us scalarize the
42491 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
42492 // SHL.
42493 // (shl V, 1) -> add V,V
42494 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
42495 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
42496 assert(N0.getValueType().isVector() && "Invalid vector shift type");
42497 // We shift all of the values by one. In many cases we do not have
42498 // hardware support for this operation. This is better expressed as an ADD
42499 // of two values.
42500 if (N1SplatC->isOne())
42501 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
42502 }
42503
42504 return SDValue();
42505}
42506
42507static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
42508 const X86Subtarget &Subtarget) {
42509 SDValue N0 = N->getOperand(0);
42510 SDValue N1 = N->getOperand(1);
42511 EVT VT = N0.getValueType();
42512 unsigned Size = VT.getSizeInBits();
42513
42514 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
42515 return V;
42516
42517 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
42518 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
42519 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
42520 // depending on sign of (SarConst - [56,48,32,24,16])
42521
42522 // sexts in X86 are MOVs. The MOVs have the same code size
42523 // as the above SHIFTs (only a SHIFT by 1 has lower code size).
42524 // However the MOVs have 2 advantages over a SHIFT:
42525 // 1. MOVs can write to a register that differs from the source
42526 // 2. MOVs accept memory operands
42527
42528 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
42529 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
42530 N0.getOperand(1).getOpcode() != ISD::Constant)
42531 return SDValue();
42532
42533 SDValue N00 = N0.getOperand(0);
42534 SDValue N01 = N0.getOperand(1);
42535 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
42536 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
42537 EVT CVT = N1.getValueType();
42538
42539 if (SarConst.isNegative())
42540 return SDValue();
42541
42542 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
42543 unsigned ShiftSize = SVT.getSizeInBits();
42544 // Skip types without a corresponding sext/zext and
42545 // ShlConst values that are not one of [56,48,32,24,16].
42546 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
42547 continue;
42548 SDLoc DL(N);
42549 SDValue NN =
42550 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
42551 SarConst = SarConst - (Size - ShiftSize);
42552 if (SarConst == 0)
42553 return NN;
42554 else if (SarConst.isNegative())
42555 return DAG.getNode(ISD::SHL, DL, VT, NN,
42556 DAG.getConstant(-SarConst, DL, CVT));
42557 else
42558 return DAG.getNode(ISD::SRA, DL, VT, NN,
42559 DAG.getConstant(SarConst, DL, CVT));
42560 }
42561 return SDValue();
42562}
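// Illustrative sketch of the fold above on i64: (ashr (shl X, 56), 60)
// matches SVT == i8 (ShlConst == 64 - 8), so NN is sign_extend_inreg i8 of X
// and SarConst becomes 60 - 56 == 4, yielding (sra NN, 4). With an ashr
// amount of 48 the difference is negative and the result is (shl NN, 8)
// instead; either way the shl/sar pair becomes a MOVSX-style extend plus at
// most one shift.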
42563
42564static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
42565 TargetLowering::DAGCombinerInfo &DCI,
42566 const X86Subtarget &Subtarget) {
42567 SDValue N0 = N->getOperand(0);
42568 SDValue N1 = N->getOperand(1);
42569 EVT VT = N0.getValueType();
42570
42571 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
42572 return V;
42573
42574 // Only do this on the last DAG combine as it can interfere with other
42575 // combines.
42576 if (!DCI.isAfterLegalizeDAG())
42577 return SDValue();
42578
42579 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
42580 // TODO: This is a generic DAG combine that became an x86-only combine to
42581 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
42582 // and-not ('andn').
42583 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
42584 return SDValue();
42585
42586 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
42587 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
42588 if (!ShiftC || !AndC)
42589 return SDValue();
42590
42591 // If we can shrink the constant mask below 8 bits or 32 bits, then this
42592 // transform should reduce code size. It may also enable secondary transforms
42593 // from improved known-bits analysis or instruction selection.
42594 APInt MaskVal = AndC->getAPIntValue();
42595
42596 // If this can be matched by a zero extend, don't optimize.
42597 if (MaskVal.isMask()) {
42598 unsigned TO = MaskVal.countTrailingOnes();
42599 if (TO >= 8 && isPowerOf2_32(TO))
42600 return SDValue();
42601 }
42602
42603 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
42604 unsigned OldMaskSize = MaskVal.getMinSignedBits();
42605 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
42606 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
42607 (OldMaskSize > 32 && NewMaskSize <= 32)) {
42608 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
42609 SDLoc DL(N);
42610 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
42611 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
42612 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
42613 }
42614 return SDValue();
42615}
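// Illustrative sketch: srl (and X, 0x1FC0), 6 becomes and (srl X, 6), 0x7F.
// The old mask needs 14 signed bits while the new mask 0x7F fits in 8, so the
// AND immediate now fits a sign-extended imm8 encoding. The isMask() early
// exit above keeps masks such as 0xFF or 0xFFFF untouched, since that AND is
// already a plain zero extend.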
42616
42617static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
42618 const X86Subtarget &Subtarget) {
42619 unsigned Opcode = N->getOpcode();
42620 assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
42621 X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
42622 X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
42623 "Unexpected hadd/hsub/pack opcode");
42624
42625 EVT VT = N->getValueType(0);
42626 SDValue N0 = N->getOperand(0);
42627 SDValue N1 = N->getOperand(1);
42628 EVT SrcVT = N0.getValueType();
42629
42630 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
42631 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
42632 // truncation trees that help us avoid lane-crossing shuffles.
42633 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
42634 // TODO: We don't handle vXf64 shuffles yet.
42635 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42636 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42637 N0.getConstantOperandAPInt(1) == 0 &&
42638 N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
42639 N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
42640 N0.getOperand(0).getValueType().is256BitVector() &&
42641 SrcVT.getScalarSizeInBits() <= 32) {
42642 // TODO - support target/faux shuffles.
42643 SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
42644 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
42645 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
42646 // shuffle to a vXi64 width - we can probably relax this in the future.
42647 SmallVector<int, 4> ShuffleMask;
42648 if (SVN->getOperand(1).isUndef() &&
42649 scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
42650 SDLoc DL(N);
42651 SDValue Lo, Hi;
42652 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
42653 std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
42654 Lo = DAG.getBitcast(N0.getValueType(), Lo);
42655 Hi = DAG.getBitcast(N1.getValueType(), Hi);
42656 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
42657 Res = DAG.getBitcast(ShufVT, Res);
42658 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
42659 return DAG.getBitcast(VT, Res);
42660 }
42661 }
42662 }
42663
42664 // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
42665 // TODO: Merge with binary shuffle folds below.
42666 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
42667 int PostShuffle[4] = {0, 1, 2, 3};
42668
42669 // If the op is a unary shuffle that can scale to v2x64,
42670 // then we can perform this as a v4x32 post shuffle.
42671 auto AdjustOp = [&](SDValue V, int Offset) {
42672 auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
42673 SmallVector<int, 2> ScaledMask;
42674 if (!SVN || !SVN->getOperand(1).isUndef() ||
42675 !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
42676 !N->isOnlyUserOf(V.getNode()))
42677 return SDValue();
42678 PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
42679 PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
42680 return SVN->getOperand(0);
42681 };
42682
42683 SDValue Src0 = AdjustOp(N0, 0);
42684 SDValue Src1 = AdjustOp(N1, 2);
42685 if (Src0 || Src1) {
42686 Src0 = Src0 ? Src0 : N0;
42687 Src1 = Src1 ? Src1 : N1;
42688 SDLoc DL(N);
42689 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
42690 SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
42691 Res = DAG.getBitcast(ShufVT, Res);
42692 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
42693 return DAG.getBitcast(VT, Res);
42694 }
42695 }
42696
42697 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
42698 // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
42699 if (VT.is256BitVector() && Subtarget.hasInt256()) {
42700 if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
42701 if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
42702 SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
42703 if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
42704 scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
42705 SDValue Op00 = SVN0->getOperand(0);
42706 SDValue Op01 = SVN0->getOperand(1);
42707 SDValue Op10 = SVN1->getOperand(0);
42708 SDValue Op11 = SVN1->getOperand(1);
42709 if ((Op00 == Op11) && (Op01 == Op10)) {
42710 std::swap(Op10, Op11);
42711 ShuffleVectorSDNode::commuteMask(ShuffleMask1);
42712 }
42713 if ((Op00 == Op10) && (Op01 == Op11)) {
42714 SmallVector<int, 4> ShuffleMask;
42715 ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
42716 ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
42717 SDLoc DL(N);
42718 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
42719 SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
42720 Res = DAG.getBitcast(ShufVT, Res);
42721 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
42722 return DAG.getBitcast(VT, Res);
42723 }
42724 }
42725 }
42726 }
42727 }
42728
42729 return SDValue();
42730}
42731
42732static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
42733 TargetLowering::DAGCombinerInfo &DCI,
42734 const X86Subtarget &Subtarget) {
42735 unsigned Opcode = N->getOpcode();
42736 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
42737 "Unexpected pack opcode");
42738
42739 EVT VT = N->getValueType(0);
42740 SDValue N0 = N->getOperand(0);
42741 SDValue N1 = N->getOperand(1);
42742 unsigned NumDstElts = VT.getVectorNumElements();
42743 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
42744 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
42745 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
42746 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
42747 "Unexpected PACKSS/PACKUS input type");
42748
42749 bool IsSigned = (X86ISD::PACKSS == Opcode);
42750
42751 // Constant Folding.
42752 APInt UndefElts0, UndefElts1;
42753 SmallVector<APInt, 32> EltBits0, EltBits1;
42754 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
42755 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
42756 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
42757 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
42758 unsigned NumLanes = VT.getSizeInBits() / 128;
42759 unsigned NumSrcElts = NumDstElts / 2;
42760 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
42761 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
42762
42763 APInt Undefs(NumDstElts, 0);
42764 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
42765 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
42766 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
42767 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
42768 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
42769 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
42770
42771 if (UndefElts[SrcIdx]) {
42772 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
42773 continue;
42774 }
42775
42776 APInt &Val = EltBits[SrcIdx];
42777 if (IsSigned) {
42778 // PACKSS: Truncate signed value with signed saturation.
42779 // Source values less than dst minint are saturated to minint.
42780 // Source values greater than dst maxint are saturated to maxint.
42781 if (Val.isSignedIntN(DstBitsPerElt))
42782 Val = Val.trunc(DstBitsPerElt);
42783 else if (Val.isNegative())
42784 Val = APInt::getSignedMinValue(DstBitsPerElt);
42785 else
42786 Val = APInt::getSignedMaxValue(DstBitsPerElt);
42787 } else {
42788 // PACKUS: Truncate signed value with unsigned saturation.
42789 // Source values less than zero are saturated to zero.
42790 // Source values greater than dst maxuint are saturated to maxuint.
42791 if (Val.isIntN(DstBitsPerElt))
42792 Val = Val.trunc(DstBitsPerElt);
42793 else if (Val.isNegative())
42794 Val = APInt::getNullValue(DstBitsPerElt);
42795 else
42796 Val = APInt::getAllOnesValue(DstBitsPerElt);
42797 }
42798 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
42799 }
42800 }
42801
42802 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
42803 }
42804
42805 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
42806 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
42807 return V;
42808
42809 // Try to combine a PACKUSWB/PACKSSWB-implemented truncate with a regular
42810 // truncate to create a larger truncate.
42811 if (Subtarget.hasAVX512() &&
42812 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
42813 N0.getOperand(0).getValueType() == MVT::v8i32) {
42814 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
42815 (!IsSigned &&
42816 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
42817 if (Subtarget.hasVLX())
42818 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
42819
42820 // Widen input to v16i32 so we can truncate that.
42821 SDLoc dl(N);
42822 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
42823 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
42824 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
42825 }
42826 }
42827
42828 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
42829 if (VT.is128BitVector()) {
42830 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
42831 SDValue Src0, Src1;
42832 if (N0.getOpcode() == ExtOpc &&
42833 N0.getOperand(0).getValueType().is64BitVector() &&
42834 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
42835 Src0 = N0.getOperand(0);
42836 }
42837 if (N1.getOpcode() == ExtOpc &&
42838 N1.getOperand(0).getValueType().is64BitVector() &&
42839 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
42840 Src1 = N1.getOperand(0);
42841 }
42842 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
42843 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
42844 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
42845 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
42846 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
42847 }
42848 }
42849
42850 // Attempt to combine as shuffle.
42851 SDValue Op(N, 0);
42852 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42853 return Res;
42854
42855 return SDValue();
42856}
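// Minimal scalar sketch of the saturation rules used by the constant folding
// above (editor illustration for the 32-bit to 16-bit case; the helpers below
// are hypothetical, not LLVM APIs, and the real code operates on APInt lanes
// and also propagates undef lanes):
constexpr int packssdwLane(long long V) {
  return V > 32767 ? 32767 : V < -32768 ? -32768 : (int)V;
}
constexpr int packusdwLane(long long V) {
  return V < 0 ? 0 : V > 65535 ? 65535 : (int)V;
}
static_assert(packssdwLane(70000) == 32767 && packssdwLane(-70000) == -32768,
              "PACKSS saturates to the signed i16 range");
static_assert(packusdwLane(-1) == 0 && packusdwLane(70000) == 65535,
              "PACKUS saturates to the unsigned i16 range");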
42857
42858static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
42859 TargetLowering::DAGCombinerInfo &DCI,
42860 const X86Subtarget &Subtarget) {
42861 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
42862 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
42863 "Unexpected horizontal add/sub opcode");
42864
42865 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
42866 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
42867 return V;
42868
42869 return SDValue();
42870}
42871
42872static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
42873 TargetLowering::DAGCombinerInfo &DCI,
42874 const X86Subtarget &Subtarget) {
42875 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
42876 X86ISD::VSRL == N->getOpcode()) &&
42877 "Unexpected shift opcode");
42878 EVT VT = N->getValueType(0);
42879 SDValue N0 = N->getOperand(0);
42880 SDValue N1 = N->getOperand(1);
42881
42882 // Shift zero -> zero.
42883 if (ISD::isBuildVectorAllZeros(N0.getNode()))
42884 return DAG.getConstant(0, SDLoc(N), VT);
42885
42886 // Detect constant shift amounts.
42887 APInt UndefElts;
42888 SmallVector<APInt, 32> EltBits;
42889 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
42890 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
42891 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
42892 EltBits[0].getZExtValue(), DAG);
42893 }
42894
42895 APInt KnownUndef, KnownZero;
42896 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42897 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
42898 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
42899 KnownZero, DCI))
42900 return SDValue(N, 0);
42901
42902 return SDValue();
42903}
42904
42905static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
42906 TargetLowering::DAGCombinerInfo &DCI,
42907 const X86Subtarget &Subtarget) {
42908 unsigned Opcode = N->getOpcode();
42909 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
42910 X86ISD::VSRLI == Opcode) &&
42911 "Unexpected shift opcode");
42912 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
42913 EVT VT = N->getValueType(0);
42914 SDValue N0 = N->getOperand(0);
42915 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
42916 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
42917 "Unexpected value type");
42918 assert(N->getOperand(1).getValueType() == MVT::i8 &&
42919 "Unexpected shift amount type");
42920
42921 // Out of range logical bit shifts are guaranteed to be zero.
42922 // Out of range arithmetic bit shifts splat the sign bit.
42923 unsigned ShiftVal = N->getConstantOperandVal(1);
42924 if (ShiftVal >= NumBitsPerElt) {
42925 if (LogicalShift)
42926 return DAG.getConstant(0, SDLoc(N), VT);
42927 ShiftVal = NumBitsPerElt - 1;
42928 }
42929
42930 // (shift X, 0) -> X
42931 if (!ShiftVal)
42932 return N0;
42933
42934 // (shift 0, C) -> 0
42935 if (ISD::isBuildVectorAllZeros(N0.getNode()))
42936 // N0 is all zeros or undef. We guarantee that the bits shifted into the
42937 // result are all zeros, not undef.
42938 return DAG.getConstant(0, SDLoc(N), VT);
42939
42940 // (VSRAI -1, C) -> -1
42941 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
42942 // N0 is all ones or undef. We guarantee that the bits shifted into the
42943 // result are all ones, not undef.
42944 return DAG.getConstant(-1, SDLoc(N), VT);
42945
42946 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
42947 if (Opcode == N0.getOpcode()) {
42948 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
42949 unsigned NewShiftVal = ShiftVal + ShiftVal2;
42950 if (NewShiftVal >= NumBitsPerElt) {
42951 // Out of range logical bit shifts are guaranteed to be zero.
42952 // Out of range arithmetic bit shifts splat the sign bit.
42953 if (LogicalShift)
42954 return DAG.getConstant(0, SDLoc(N), VT);
42955 NewShiftVal = NumBitsPerElt - 1;
42956 }
42957 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
42958 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
42959 }
42960
42961 // We can decode 'whole byte' logical bit shifts as shuffles.
42962 if (LogicalShift && (ShiftVal % 8) == 0) {
42963 SDValue Op(N, 0);
42964 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42965 return Res;
42966 }
42967
42968 // Constant Folding.
42969 APInt UndefElts;
42970 SmallVector<APInt, 32> EltBits;
42971 if (N->isOnlyUserOf(N0.getNode()) &&
42972 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
42973 assert(EltBits.size() == VT.getVectorNumElements() &&
42974 "Unexpected shift value type");
42975 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
42976 // created an undef input due to no input bits being demanded, but user
42977 // still expects 0 in other bits.
42978 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
42979 APInt &Elt = EltBits[i];
42980 if (UndefElts[i])
42981 Elt = 0;
42982 else if (X86ISD::VSHLI == Opcode)
42983 Elt <<= ShiftVal;
42984 else if (X86ISD::VSRAI == Opcode)
42985 Elt.ashrInPlace(ShiftVal);
42986 else
42987 Elt.lshrInPlace(ShiftVal);
42988 }
42989 // Reset undef elements since they were zeroed above.
42990 UndefElts = 0;
42991 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
42992 }
42993
42994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42995 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
42996 APInt::getAllOnesValue(NumBitsPerElt), DCI))
42997 return SDValue(N, 0);
42998
42999 return SDValue();
43000}
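// Illustrative sketch of the shift-merging rules above, on v8i16 (16 bits per
// element): VSRLI(VSRLI(X, 9), 9) merges to a shift by 18 >= 16 and folds to
// zero, VSRAI(VSRAI(X, 9), 9) clamps to VSRAI(X, 15), which splats the sign
// bit of each element, and an in-range pair such as VSHLI(VSHLI(X, 3), 4)
// simply becomes VSHLI(X, 7).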
43001
43002static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
43003 TargetLowering::DAGCombinerInfo &DCI,
43004 const X86Subtarget &Subtarget) {
43005 EVT VT = N->getValueType(0);
43006 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
43007 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
43008 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
43009 "Unexpected vector insertion");
43010
43011 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
43012 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
43013 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43014 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
43015 APInt::getAllOnesValue(NumBitsPerElt), DCI))
43016 return SDValue(N, 0);
43017 }
43018
43019 // Attempt to combine insertion patterns to a shuffle.
43020 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
43021 SDValue Op(N, 0);
43022 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43023 return Res;
43024 }
43025
43026 return SDValue();
43027}
43028
43029/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
43030/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
43031/// OR -> CMPNEQSS.
43032static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
43033 TargetLowering::DAGCombinerInfo &DCI,
43034 const X86Subtarget &Subtarget) {
43035 unsigned opcode;
43036
43037 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
43038 // we're requiring SSE2 for both.
43039 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
43040 SDValue N0 = N->getOperand(0);
43041 SDValue N1 = N->getOperand(1);
43042 SDValue CMP0 = N0.getOperand(1);
43043 SDValue CMP1 = N1.getOperand(1);
43044 SDLoc DL(N);
43045
43046 // The SETCCs should both refer to the same CMP.
43047 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
43048 return SDValue();
43049
43050 SDValue CMP00 = CMP0->getOperand(0);
43051 SDValue CMP01 = CMP0->getOperand(1);
43052 EVT VT = CMP00.getValueType();
43053
43054 if (VT == MVT::f32 || VT == MVT::f64) {
43055 bool ExpectingFlags = false;
43056 // Check for any users that want flags:
43057 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
43058 !ExpectingFlags && UI != UE; ++UI)
43059 switch (UI->getOpcode()) {
43060 default:
43061 case ISD::BR_CC:
43062 case ISD::BRCOND:
43063 case ISD::SELECT:
43064 ExpectingFlags = true;
43065 break;
43066 case ISD::CopyToReg:
43067 case ISD::SIGN_EXTEND:
43068 case ISD::ZERO_EXTEND:
43069 case ISD::ANY_EXTEND:
43070 break;
43071 }
43072
43073 if (!ExpectingFlags) {
43074 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
43075 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
43076
43077 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
43078 X86::CondCode tmp = cc0;
43079 cc0 = cc1;
43080 cc1 = tmp;
43081 }
43082
43083 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
43084 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
43085 // FIXME: need symbolic constants for these magic numbers.
43086 // See X86ATTInstPrinter.cpp:printSSECC().
43087 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
43088 if (Subtarget.hasAVX512()) {
43089 SDValue FSetCC =
43090 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
43091 DAG.getTargetConstant(x86cc, DL, MVT::i8));
43092 // Need to fill with zeros to ensure the bitcast will produce zeroes
43093 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
43094 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
43095 DAG.getConstant(0, DL, MVT::v16i1),
43096 FSetCC, DAG.getIntPtrConstant(0, DL));
43097 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
43098 N->getSimpleValueType(0));
43099 }
43100 SDValue OnesOrZeroesF =
43101 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
43102 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
43103
43104 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
43105 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
43106
43107 if (is64BitFP && !Subtarget.is64Bit()) {
43108 // On a 32-bit target, we cannot bitcast the 64-bit float to a
43109 // 64-bit integer, since that's not a legal type. Since
43110 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
43111 // bits, but can do this little dance to extract the lowest 32 bits
43112 // and work with those going forward.
43113 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
43114 OnesOrZeroesF);
43115 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
43116 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
43117 Vector32, DAG.getIntPtrConstant(0, DL));
43118 IntVT = MVT::i32;
43119 }
43120
43121 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
43122 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
43123 DAG.getConstant(1, DL, IntVT));
43124 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
43125 ANDed);
43126 return OneBitOfTruth;
43127 }
43128 }
43129 }
43130 }
43131 return SDValue();
43132}
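// Illustrative sketch of the flag reasoning above (assuming the FCMP lowers
// to a UCOMISS-style compare): COND_E tests ZF and COND_NP tests !PF, so
// (setcc E) & (setcc NP) means "ordered and equal", which is the CMPEQSS
// predicate (immediate 0); the OR form with NE/P is unordered-or-unequal,
// i.e. CMPNEQSS (immediate 4). The FSETCC result is an all-ones/all-zeroes
// lane, so the trailing AND 1 + TRUNCATE extracts a single boolean bit.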
43133
43134/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
43135static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
43136 assert(N->getOpcode() == ISD::AND);
43137
43138 MVT VT = N->getSimpleValueType(0);
43139 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
43140 return SDValue();
43141
43142 SDValue X, Y;
43143 SDValue N0 = N->getOperand(0);
43144 SDValue N1 = N->getOperand(1);
43145
43146 auto GetNot = [&VT, &DAG](SDValue V) {
43147 // Basic X = NOT(Y) detection.
43148 if (SDValue Not = IsNOT(V, DAG))
43149 return Not;
43150 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
43151 if (V.getOpcode() == X86ISD::VBROADCAST) {
43152 SDValue Src = V.getOperand(0);
43153 EVT SrcVT = Src.getValueType();
43154 if (!SrcVT.isVector())
43155 return SDValue();
43156 if (SDValue Not = IsNOT(Src, DAG))
43157 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
43158 DAG.getBitcast(SrcVT, Not));
43159 }
43160 return SDValue();
43161 };
43162
43163 if (SDValue Not = GetNot(N0)) {
43164 X = Not;
43165 Y = N1;
43166 } else if (SDValue Not = GetNot(N1)) {
43167 X = Not;
43168 Y = N0;
43169 } else
43170 return SDValue();
43171
43172 X = DAG.getBitcast(VT, X);
43173 Y = DAG.getBitcast(VT, Y);
43174 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
43175}
43176
43177// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
43178// logical operations, like in the example below.
43179 // or (and (truncate x), (truncate y)),
43180 //    (xor (truncate z), build_vector (constants))
43181 // Given a target type \p VT, we generate
43182 // or (and x, y), (xor z, zext(build_vector (constants)))
43183 // where x, y and z are of type \p VT. We can do so if each operand is either
43184 // a truncate from VT or can be recursively promoted, and the right-hand
43185 // operand may also be a vector of constants.
43186static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
43187 unsigned Depth) {
43188 // Limit recursion to avoid excessive compile times.
43189 if (Depth >= SelectionDAG::MaxRecursionDepth)
43190 return SDValue();
43191
43192 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
43193 N->getOpcode() != ISD::OR)
43194 return SDValue();
43195
43196 SDValue N0 = N->getOperand(0);
43197 SDValue N1 = N->getOperand(1);
43198 SDLoc DL(N);
43199
43200 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43201 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
43202 return SDValue();
43203
43204 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
43205 N0 = NN0;
43206 else {
43207 // The Left side has to be a trunc.
43208 if (N0.getOpcode() != ISD::TRUNCATE)
43209 return SDValue();
43210
43211 // The type of the truncated inputs.
43212 if (N0.getOperand(0).getValueType() != VT)
43213 return SDValue();
43214
43215 N0 = N0.getOperand(0);
43216 }
43217
43218 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
43219 N1 = NN1;
43220 else {
43221 // The right side has to be a 'trunc' or a constant vector.
43222 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
43223 N1.getOperand(0).getValueType() == VT;
43224 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
43225 return SDValue();
43226
43227 if (RHSTrunc)
43228 N1 = N1.getOperand(0);
43229 else
43230 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
43231 }
43232
43233 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
43234}
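// A minimal illustrative sketch (hypothetical types): for VT = v8i32, the node
//   and (truncate v8i32 X to v8i16), (truncate v8i32 Y to v8i16)
// is rebuilt as the wide node
//   and X, Y
// so that the surrounding extend, handled by the overload below, can be
// expressed on the wide value directly.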
43235
43236// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
43237// register. In most cases we actually compare or select YMM-sized registers
43238// and mixing the two types creates horrible code. This method optimizes
43239// some of the transition sequences.
43240// Even with AVX-512 this is still useful for removing casts around logical
43241// operations on vXi1 mask types.
43242static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43243 const X86Subtarget &Subtarget) {
43244 EVT VT = N->getValueType(0);
43245  assert(VT.isVector() && "Expected vector type");
43246
43247 SDLoc DL(N);
43248  assert((N->getOpcode() == ISD::ANY_EXTEND ||
43249          N->getOpcode() == ISD::ZERO_EXTEND ||
43250          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
43251
43252 SDValue Narrow = N->getOperand(0);
43253 EVT NarrowVT = Narrow.getValueType();
43254
43255 // Generate the wide operation.
43256 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
43257 if (!Op)
43258 return SDValue();
43259 switch (N->getOpcode()) {
43260  default: llvm_unreachable("Unexpected opcode");
43261 case ISD::ANY_EXTEND:
43262 return Op;
43263 case ISD::ZERO_EXTEND:
43264 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
43265 case ISD::SIGN_EXTEND:
43266 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
43267 Op, DAG.getValueType(NarrowVT));
43268 }
43269}
43270
43271static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
43272 unsigned FPOpcode;
43273 switch (Opcode) {
43274  default: llvm_unreachable("Unexpected input node for FP logic conversion");
43275 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43276 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43277 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43278 }
43279 return FPOpcode;
43280}
43281
43282/// If both input operands of a logic op are being cast from floating point
43283/// types, try to convert this into a floating point logic node to avoid
43284/// unnecessary moves from SSE to integer registers.
43285static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
43286 const X86Subtarget &Subtarget) {
43287 EVT VT = N->getValueType(0);
43288 SDValue N0 = N->getOperand(0);
43289 SDValue N1 = N->getOperand(1);
43290 SDLoc DL(N);
43291
43292 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
43293 return SDValue();
43294
43295 SDValue N00 = N0.getOperand(0);
43296 SDValue N10 = N1.getOperand(0);
43297 EVT N00Type = N00.getValueType();
43298 EVT N10Type = N10.getValueType();
43299
43300 // Ensure that both types are the same and are legal scalar fp types.
43301 if (N00Type != N10Type ||
43302 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
43303 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
43304 return SDValue();
43305
43306 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
43307 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
43308 return DAG.getBitcast(VT, FPLogic);
43309}
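// A minimal illustrative sketch (hypothetical values):
//   t0: i32 = bitcast f32 A
//   t1: i32 = bitcast f32 B
//   t2: i32 = and t0, t1
// becomes
//   t2: i32 = bitcast (f32 X86ISD::FAND A, B)
// keeping the computation in SSE registers instead of moving to GPRs and back.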
43310
43311// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
43312// to reduce XMM->GPR traffic.
43313static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
43314 unsigned Opc = N->getOpcode();
43315  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
43316         "Unexpected bit opcode");
43317
43318 SDValue N0 = N->getOperand(0);
43319 SDValue N1 = N->getOperand(1);
43320
43321 // Both operands must be single use MOVMSK.
43322 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
43323 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
43324 return SDValue();
43325
43326 SDValue Vec0 = N0.getOperand(0);
43327 SDValue Vec1 = N1.getOperand(0);
43328 EVT VecVT0 = Vec0.getValueType();
43329 EVT VecVT1 = Vec1.getValueType();
43330
43331 // Both MOVMSK operands must be from vectors of the same size and same element
43332  // size, but it's OK for the types to differ between fp and int.
43333 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
43334 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
43335 return SDValue();
43336
43337 SDLoc DL(N);
43338 unsigned VecOpc =
43339 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
43340 SDValue Result =
43341 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
43342 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43343}
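// A minimal illustrative sketch (hypothetical v4f32 inputs):
//   or (X86ISD::MOVMSK (v4f32 A)), (X86ISD::MOVMSK (v4f32 B))
// becomes
//   X86ISD::MOVMSK (X86ISD::FOR A, B)
// trading two XMM->GPR transfers plus a scalar OR for one vector OR and one MOVMSK.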
43344
43345/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
43346/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
43347/// with a shift-right to eliminate loading the vector constant mask value.
43348static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
43349 const X86Subtarget &Subtarget) {
43350 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
43351 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
43352 EVT VT0 = Op0.getValueType();
43353 EVT VT1 = Op1.getValueType();
43354
43355 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
43356 return SDValue();
43357
43358 APInt SplatVal;
43359 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
43360 !SplatVal.isMask())
43361 return SDValue();
43362
43363 // Don't prevent creation of ANDN.
43364 if (isBitwiseNot(Op0))
43365 return SDValue();
43366
43367 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
43368 return SDValue();
43369
43370 unsigned EltBitWidth = VT0.getScalarSizeInBits();
43371 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
43372 return SDValue();
43373
43374 SDLoc DL(N);
43375 unsigned ShiftVal = SplatVal.countTrailingOnes();
43376 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
43377 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
43378 return DAG.getBitcast(N->getValueType(0), Shift);
43379}
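// A minimal illustrative sketch (hypothetical v4i32 input): if every element of
// X is known to be 0 or -1 (ComputeNumSignBits == 32), then
//   and X, BUILD_VECTOR<1,1,1,1>
// can be rewritten as
//   X86ISD::VSRLI X, 31
// which avoids materializing the constant mask from memory.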
43380
43381// Get the index node from the lowered DAG of a GEP IR instruction with one
43382// indexing dimension.
43383static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
43384 if (Ld->isIndexed())
43385 return SDValue();
43386
43387 SDValue Base = Ld->getBasePtr();
43388
43389 if (Base.getOpcode() != ISD::ADD)
43390 return SDValue();
43391
43392 SDValue ShiftedIndex = Base.getOperand(0);
43393
43394 if (ShiftedIndex.getOpcode() != ISD::SHL)
43395 return SDValue();
43396
43397 return ShiftedIndex.getOperand(0);
43398
43399}
43400
43401static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
43402 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
43403 switch (VT.getSizeInBits()) {
43404 default: return false;
43405    case 64: return Subtarget.is64Bit();
43406 case 32: return true;
43407 }
43408 }
43409 return false;
43410}
43411
43412// This function recognizes cases where the X86 bzhi instruction can replace an
43413// 'and-load' sequence.
43414// When an integer value is loaded from an array of constants which is defined
43415// as follows:
43416//
43417//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
43418//
43419// and a bitwise and is then applied to the result with another input,
43420// it is equivalent to performing bzhi (zero high bits) on that input, with the
43421// same index as the load.
43422static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
43423 const X86Subtarget &Subtarget) {
43424 MVT VT = Node->getSimpleValueType(0);
43425 SDLoc dl(Node);
43426
43427 // Check if subtarget has BZHI instruction for the node's type
43428 if (!hasBZHI(Subtarget, VT))
43429 return SDValue();
43430
43431 // Try matching the pattern for both operands.
43432 for (unsigned i = 0; i < 2; i++) {
43433 SDValue N = Node->getOperand(i);
43434 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
43435
43436    // Bail out if the operand is not a load instruction.
43437 if (!Ld)
43438 return SDValue();
43439
43440 const Value *MemOp = Ld->getMemOperand()->getValue();
43441
43442 if (!MemOp)
43443 return SDValue();
43444
43445 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
43446 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
43447 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
43448
43449 Constant *Init = GV->getInitializer();
43450 Type *Ty = Init->getType();
43451 if (!isa<ConstantDataArray>(Init) ||
43452 !Ty->getArrayElementType()->isIntegerTy() ||
43453 Ty->getArrayElementType()->getScalarSizeInBits() !=
43454 VT.getSizeInBits() ||
43455 Ty->getArrayNumElements() >
43456 Ty->getArrayElementType()->getScalarSizeInBits())
43457 continue;
43458
43459 // Check if the array's constant elements are suitable to our case.
43460 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
43461 bool ConstantsMatch = true;
43462 for (uint64_t j = 0; j < ArrayElementCount; j++) {
43463 ConstantInt *Elem =
43464 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
43465 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
43466 ConstantsMatch = false;
43467 break;
43468 }
43469 }
43470 if (!ConstantsMatch)
43471 continue;
43472
43473          // Do the transformation (for a 32-bit type):
43474          //   from: (and (load arr[idx]), inp)
43475          //   to:   (and (srl 0xFFFFFFFF, (sub 32, idx)), inp)
43476          // which will later be matched as a single bzhi instruction.
43477 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
43478 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
43479
43480 // Get the Node which indexes into the array.
43481 SDValue Index = getIndexFromUnindexedLoad(Ld);
43482 if (!Index)
43483 return SDValue();
43484 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
43485
43486 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
43487 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
43488
43489 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
43490 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
43491
43492 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
43493 }
43494 }
43495 }
43496 }
43497 return SDValue();
43498}
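// A scalar model of the equivalence used above (illustrative, 32-bit case):
// since arr[idx] == (1u << idx) - 1, for idx in [1, 31] we have
//   inp & arr[idx] == inp & (0xFFFFFFFFu >> (32 - idx))
// and that srl-of-all-ones form is what is later matched as a single BZHI.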
43499
43500// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
43501// where C is a mask containing the same number of bits as the setcc and
43502// where the setcc will freely zero the upper bits of the k-register. We can
43503// replace the undef in the concat with 0s and remove the AND. This mainly
43504// helps with v2i1/v4i1 setccs being cast to scalar.
43505static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
43506 const X86Subtarget &Subtarget) {
43507  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
43508
43509 EVT VT = N->getValueType(0);
43510
43511 // Make sure this is an AND with constant. We will check the value of the
43512 // constant later.
43513 if (!isa<ConstantSDNode>(N->getOperand(1)))
43514 return SDValue();
43515
43516 // This is implied by the ConstantSDNode.
43517  assert(!VT.isVector() && "Expected scalar VT!");
43518
43519 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
43520 !N->getOperand(0).hasOneUse() ||
43521 !N->getOperand(0).getOperand(0).hasOneUse())
43522 return SDValue();
43523
43524 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43525 SDValue Src = N->getOperand(0).getOperand(0);
43526 EVT SrcVT = Src.getValueType();
43527 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
43528 !TLI.isTypeLegal(SrcVT))
43529 return SDValue();
43530
43531 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
43532 return SDValue();
43533
43534 // We only care about the first subvector of the concat, we expect the
43535 // other subvectors to be ignored due to the AND if we make the change.
43536 SDValue SubVec = Src.getOperand(0);
43537 EVT SubVecVT = SubVec.getValueType();
43538
43539 // First subvector should be a setcc with a legal result type. The RHS of the
43540 // AND should be a mask with this many bits.
43541 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
43542 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
43543 return SDValue();
43544
43545 EVT SetccVT = SubVec.getOperand(0).getValueType();
43546 if (!TLI.isTypeLegal(SetccVT) ||
43547 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
43548 return SDValue();
43549
43550 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
43551 return SDValue();
43552
43553 // We passed all the checks. Rebuild the concat_vectors with zeroes
43554 // and cast it back to VT.
43555 SDLoc dl(N);
43556 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
43557 DAG.getConstant(0, dl, SubVecVT));
43558 Ops[0] = SubVec;
43559 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
43560 Ops);
43561 return DAG.getBitcast(VT, Concat);
43562}
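// A minimal illustrative sketch (hypothetical types, AVX512VL assumed): for
//   and (bitcast i8 (concat_vectors (v4i1 setcc), undef)), 0xF
// the undef upper lanes are replaced by a zero subvector, after which the AND
// with the low-bits mask 0xF is redundant and the whole node becomes
//   bitcast i8 (concat_vectors (v4i1 setcc), zero)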
43563
43564static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
43565 TargetLowering::DAGCombinerInfo &DCI,
43566 const X86Subtarget &Subtarget) {
43567 EVT VT = N->getValueType(0);
43568
43569 // If this is SSE1 only convert to FAND to avoid scalarization.
43570 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
43571 return DAG.getBitcast(
43572 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
43573 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
43574 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
43575 }
43576
43577 // Use a 32-bit and+zext if upper bits known zero.
43578 if (VT == MVT::i64 && Subtarget.is64Bit() &&
43579 !isa<ConstantSDNode>(N->getOperand(1))) {
43580 APInt HiMask = APInt::getHighBitsSet(64, 32);
43581 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
43582 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
43583 SDLoc dl(N);
43584 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
43585 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
43586 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
43587 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
43588 }
43589 }
43590
43591 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
43592 // TODO: Support multiple SrcOps.
43593 if (VT == MVT::i1) {
43594 SmallVector<SDValue, 2> SrcOps;
43595 SmallVector<APInt, 2> SrcPartials;
43596 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
43597 SrcOps.size() == 1) {
43598 SDLoc dl(N);
43599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43600 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
43601 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
43602 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
43603 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
43604 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
43605 if (Mask) {
43606        assert(SrcPartials[0].getBitWidth() == NumElts &&
43607               "Unexpected partial reduction mask");
43608 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
43609 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
43610 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
43611 }
43612 }
43613 }
43614
43615 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
43616 return V;
43617
43618 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
43619 return R;
43620
43621 if (DCI.isBeforeLegalizeOps())
43622 return SDValue();
43623
43624 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
43625 return R;
43626
43627 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
43628 return FPLogic;
43629
43630 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
43631 return R;
43632
43633 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
43634 return ShiftRight;
43635
43636 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
43637 return R;
43638
43639 // Attempt to recursively combine a bitmask AND with shuffles.
43640 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
43641 SDValue Op(N, 0);
43642 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43643 return Res;
43644 }
43645
43646 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
43647 if ((VT.getScalarSizeInBits() % 8) == 0 &&
43648 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
43649 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
43650 SDValue BitMask = N->getOperand(1);
43651 SDValue SrcVec = N->getOperand(0).getOperand(0);
43652 EVT SrcVecVT = SrcVec.getValueType();
43653
43654 // Check that the constant bitmask masks whole bytes.
43655 APInt UndefElts;
43656 SmallVector<APInt, 64> EltBits;
43657 if (VT == SrcVecVT.getScalarType() &&
43658 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
43659 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
43660 llvm::all_of(EltBits, [](const APInt &M) {
43661 return M.isNullValue() || M.isAllOnesValue();
43662 })) {
43663 unsigned NumElts = SrcVecVT.getVectorNumElements();
43664 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
43665 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
43666
43667 // Create a root shuffle mask from the byte mask and the extracted index.
43668 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
43669 for (unsigned i = 0; i != Scale; ++i) {
43670 if (UndefElts[i])
43671 continue;
43672 int VecIdx = Scale * Idx + i;
43673 ShuffleMask[VecIdx] =
43674 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
43675 }
43676
43677 if (SDValue Shuffle = combineX86ShufflesRecursively(
43678 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
43679 X86::MaxShuffleCombineDepth,
43680 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
43681 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
43682 N->getOperand(0).getOperand(1));
43683 }
43684 }
43685
43686 return SDValue();
43687}
43688
43689// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
43690static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
43691 const X86Subtarget &Subtarget) {
43692  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
43693
43694 MVT VT = N->getSimpleValueType(0);
43695 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
43696 return SDValue();
43697
43698 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
43699 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
43700 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
43701 return SDValue();
43702
43703 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
43704 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
43705 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
43706 Subtarget.hasVLX();
43707 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
43708 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
43709 return SDValue();
43710
43711 // Attempt to extract constant byte masks.
43712 APInt UndefElts0, UndefElts1;
43713 SmallVector<APInt, 32> EltBits0, EltBits1;
43714 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
43715 false, false))
43716 return SDValue();
43717 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
43718 false, false))
43719 return SDValue();
43720
43721 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
43722 // TODO - add UNDEF elts support.
43723 if (UndefElts0[i] || UndefElts1[i])
43724 return SDValue();
43725 if (EltBits0[i] != ~EltBits1[i])
43726 return SDValue();
43727 }
43728
43729 SDLoc DL(N);
43730
43731 if (UseVPTERNLOG) {
43732 // Emit a VPTERNLOG node directly.
43733 SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
43734 SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
43735 SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
43736 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
43737 return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
43738 }
43739
43740 SDValue X = N->getOperand(0);
43741 SDValue Y =
43742 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
43743 DAG.getBitcast(VT, N1.getOperand(0)));
43744 return DAG.getNode(ISD::OR, DL, VT, X, Y);
43745}
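// A note on the 0xCA immediate (an illustrative aside): with the operand order
// used above (A = mask constant, B = X, C = Y), VPTERNLOG forms its lookup
// index as A*4 + B*2 + C, and imm 0xCA = 0b11001010 evaluates to
// (A & B) | (~A & C), i.e. the bitwise select OR(AND(X,Mask), AND(Y,~Mask))
// computed in one instruction.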
43746
43747// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
43748static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
43749 if (N->getOpcode() != ISD::OR)
43750 return false;
43751
43752 SDValue N0 = N->getOperand(0);
43753 SDValue N1 = N->getOperand(1);
43754
43755 // Canonicalize AND to LHS.
43756 if (N1.getOpcode() == ISD::AND)
43757 std::swap(N0, N1);
43758
43759 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
43760 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
43761 return false;
43762
43763 Mask = N1.getOperand(0);
43764 X = N1.getOperand(1);
43765
43766 // Check to see if the mask appeared in both the AND and ANDNP.
43767 if (N0.getOperand(0) == Mask)
43768 Y = N0.getOperand(1);
43769 else if (N0.getOperand(1) == Mask)
43770 Y = N0.getOperand(0);
43771 else
43772 return false;
43773
43774 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
43775 // ANDNP combine allows other combines to happen that prevent matching.
43776 return true;
43777}
43778
43779// Try to fold:
43780// (or (and (m, y), (pandn m, x)))
43781// into:
43782// (vselect m, x, y)
43783// As a special case, try to fold:
43784// (or (and (m, (sub 0, x)), (pandn m, x)))
43785// into:
43786// (sub (xor X, M), M)
43787static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
43788 const X86Subtarget &Subtarget) {
43789  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
43790
43791 EVT VT = N->getValueType(0);
43792 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
43793 (VT.is256BitVector() && Subtarget.hasInt256())))
43794 return SDValue();
43795
43796 SDValue X, Y, Mask;
43797 if (!matchLogicBlend(N, X, Y, Mask))
43798 return SDValue();
43799
43800 // Validate that X, Y, and Mask are bitcasts, and see through them.
43801 Mask = peekThroughBitcasts(Mask);
43802 X = peekThroughBitcasts(X);
43803 Y = peekThroughBitcasts(Y);
43804
43805 EVT MaskVT = Mask.getValueType();
43806 unsigned EltBits = MaskVT.getScalarSizeInBits();
43807
43808 // TODO: Attempt to handle floating point cases as well?
43809 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
43810 return SDValue();
43811
43812 SDLoc DL(N);
43813
43814 // Attempt to combine to conditional negate: (sub (xor X, M), M)
43815 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
43816 DAG, Subtarget))
43817 return Res;
43818
43819 // PBLENDVB is only available on SSE 4.1.
43820 if (!Subtarget.hasSSE41())
43821 return SDValue();
43822
43823 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
43824 if (Subtarget.hasVLX())
43825 return SDValue();
43826
43827 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
43828
43829 X = DAG.getBitcast(BlendVT, X);
43830 Y = DAG.getBitcast(BlendVT, Y);
43831 Mask = DAG.getBitcast(BlendVT, Mask);
43832 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
43833 return DAG.getBitcast(VT, Mask);
43834}
43835
43836// Helper function for combineOrCmpEqZeroToCtlzSrl
43837// Transforms:
43838// seteq(cmp x, 0)
43839// into:
43840// srl(ctlz x), log2(bitsize(x))
43841// Input pattern is checked by caller.
43842static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
43843 SelectionDAG &DAG) {
43844 SDValue Cmp = Op.getOperand(1);
43845 EVT VT = Cmp.getOperand(0).getValueType();
43846 unsigned Log2b = Log2_32(VT.getSizeInBits());
43847 SDLoc dl(Op);
43848 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
43849 // The result of the shift is true or false, and on X86, the 32-bit
43850 // encoding of shr and lzcnt is more desirable.
43851 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
43852 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
43853 DAG.getConstant(Log2b, dl, MVT::i8));
43854 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
43855}
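// A minimal illustrative sketch (32-bit case): for i32 x, ctlz(x) lies in
// [0, 31] when x != 0 and equals 32 when x == 0, so
//   srl (ctlz x), 5
// is already the 0/1 value of seteq(x, 0); no explicit compare is needed.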
43856
43857// Try to transform:
43858// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
43859// into:
43860// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
43861// Will also attempt to match more generic cases, e.g.:
43862// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
43863// Only applies if the target supports the FastLZCNT feature.
43864static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
43865 TargetLowering::DAGCombinerInfo &DCI,
43866 const X86Subtarget &Subtarget) {
43867 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
43868 return SDValue();
43869
43870 auto isORCandidate = [](SDValue N) {
43871 return (N->getOpcode() == ISD::OR && N->hasOneUse());
43872 };
43873
43874 // Check the zero extend is extending to 32-bit or more. The code generated by
43875 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
43876 // instructions to clear the upper bits.
43877 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
43878 !isORCandidate(N->getOperand(0)))
43879 return SDValue();
43880
43881 // Check the node matches: setcc(eq, cmp 0)
43882 auto isSetCCCandidate = [](SDValue N) {
43883 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
43884 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
43885 N->getOperand(1).getOpcode() == X86ISD::CMP &&
43886 isNullConstant(N->getOperand(1).getOperand(1)) &&
43887 N->getOperand(1).getValueType().bitsGE(MVT::i32);
43888 };
43889
43890 SDNode *OR = N->getOperand(0).getNode();
43891 SDValue LHS = OR->getOperand(0);
43892 SDValue RHS = OR->getOperand(1);
43893
43894 // Save nodes matching or(or, setcc(eq, cmp 0)).
43895 SmallVector<SDNode *, 2> ORNodes;
43896 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
43897 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
43898 ORNodes.push_back(OR);
43899 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
43900 LHS = OR->getOperand(0);
43901 RHS = OR->getOperand(1);
43902 }
43903
43904 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
43905 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
43906 !isORCandidate(SDValue(OR, 0)))
43907 return SDValue();
43908
43909  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
43910 // to
43911 // or(srl(ctlz),srl(ctlz)).
43912 // The dag combiner can then fold it into:
43913 // srl(or(ctlz, ctlz)).
43914 EVT VT = OR->getValueType(0);
43915 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
43916 SDValue Ret, NewRHS;
43917 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
43918 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
43919
43920 if (!Ret)
43921 return SDValue();
43922
43923 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
43924 while (ORNodes.size() > 0) {
43925 OR = ORNodes.pop_back_val();
43926 LHS = OR->getOperand(0);
43927 RHS = OR->getOperand(1);
43928 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
43929 if (RHS->getOpcode() == ISD::OR)
43930 std::swap(LHS, RHS);
43931 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
43932 if (!NewRHS)
43933 return SDValue();
43934 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
43935 }
43936
43937 if (Ret)
43938 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
43939
43940 return Ret;
43941}
43942
43943static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
43944 TargetLowering::DAGCombinerInfo &DCI,
43945 const X86Subtarget &Subtarget) {
43946 SDValue N0 = N->getOperand(0);
43947 SDValue N1 = N->getOperand(1);
43948 EVT VT = N->getValueType(0);
43949
43950 // If this is SSE1 only convert to FOR to avoid scalarization.
43951 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
43952 return DAG.getBitcast(MVT::v4i32,
43953 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
43954 DAG.getBitcast(MVT::v4f32, N0),
43955 DAG.getBitcast(MVT::v4f32, N1)));
43956 }
43957
43958 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
43959 // TODO: Support multiple SrcOps.
43960 if (VT == MVT::i1) {
43961 SmallVector<SDValue, 2> SrcOps;
43962 SmallVector<APInt, 2> SrcPartials;
43963 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
43964 SrcOps.size() == 1) {
43965 SDLoc dl(N);
43966 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43967 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
43968 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
43969 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
43970 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
43971 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
43972 if (Mask) {
43973        assert(SrcPartials[0].getBitWidth() == NumElts &&
43974               "Unexpected partial reduction mask");
43975 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
43976 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
43977 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
43978 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
43979 }
43980 }
43981 }
43982
43983 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
43984 return R;
43985
43986 if (DCI.isBeforeLegalizeOps())
43987 return SDValue();
43988
43989 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
43990 return R;
43991
43992 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
43993 return FPLogic;
43994
43995 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
43996 return R;
43997
43998 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
43999 return R;
44000
44001 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
44002 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
44003 // iff the upper elements of the non-shifted arg are zero.
44004 // KUNPCK require 16+ bool vector elements.
44005 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
44006 unsigned NumElts = VT.getVectorNumElements();
44007 unsigned HalfElts = NumElts / 2;
44008 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
44009 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
44010 N1.getConstantOperandAPInt(1) == HalfElts &&
44011 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
44012 SDLoc dl(N);
44013 return DAG.getNode(
44014 ISD::CONCAT_VECTORS, dl, VT,
44015 extractSubVector(N0, 0, DAG, dl, HalfElts),
44016 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
44017 }
44018 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
44019 N0.getConstantOperandAPInt(1) == HalfElts &&
44020 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
44021 SDLoc dl(N);
44022 return DAG.getNode(
44023 ISD::CONCAT_VECTORS, dl, VT,
44024 extractSubVector(N1, 0, DAG, dl, HalfElts),
44025 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
44026 }
44027 }
44028
44029 // Attempt to recursively combine an OR of shuffles.
44030 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44031 SDValue Op(N, 0);
44032 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44033 return Res;
44034 }
44035
44036 return SDValue();
44037}
44038
44039/// Try to turn tests against the signbit in the form of:
44040/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
44041/// into:
44042/// SETGT(X, -1)
44043static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
44044 // This is only worth doing if the output type is i8 or i1.
44045 EVT ResultType = N->getValueType(0);
44046 if (ResultType != MVT::i8 && ResultType != MVT::i1)
44047 return SDValue();
44048
44049 SDValue N0 = N->getOperand(0);
44050 SDValue N1 = N->getOperand(1);
44051
44052 // We should be performing an xor against a truncated shift.
44053 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
44054 return SDValue();
44055
44056 // Make sure we are performing an xor against one.
44057 if (!isOneConstant(N1))
44058 return SDValue();
44059
44060 // SetCC on x86 zero extends so only act on this if it's a logical shift.
44061 SDValue Shift = N0.getOperand(0);
44062 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
44063 return SDValue();
44064
44065 // Make sure we are truncating from one of i16, i32 or i64.
44066 EVT ShiftTy = Shift.getValueType();
44067 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
44068 return SDValue();
44069
44070 // Make sure the shift amount extracts the sign bit.
44071 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
44072 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
44073 return SDValue();
44074
44075 // Create a greater-than comparison against -1.
44076  // N.B. Using SETGE against 0 works but we want a canonical-looking
44077  // comparison; using SETGT matches up with what TranslateX86CC produces.
44078 SDLoc DL(N);
44079 SDValue ShiftOp = Shift.getOperand(0);
44080 EVT ShiftOpTy = ShiftOp.getValueType();
44081 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44082 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
44083 *DAG.getContext(), ResultType);
44084 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
44085 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
44086 if (SetCCResultType != ResultType)
44087 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
44088 return Cond;
44089}
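// A minimal illustrative sketch (32-bit case): trunc(srl(X, 31)) is the sign
// bit of X, so
//   xor (trunc (srl X, 31)), 1
// asks "is the sign bit clear?", which is exactly setgt(X, -1).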
44090
44091/// Turn vector tests of the signbit in the form of:
44092/// xor (sra X, elt_size(X)-1), -1
44093/// into:
44094/// pcmpgt X, -1
44095///
44096/// This should be called before type legalization because the pattern may not
44097/// persist after that.
44098static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
44099 const X86Subtarget &Subtarget) {
44100 EVT VT = N->getValueType(0);
44101 if (!VT.isSimple())
44102 return SDValue();
44103
44104 switch (VT.getSimpleVT().SimpleTy) {
44105 default: return SDValue();
44106 case MVT::v16i8:
44107 case MVT::v8i16:
44108 case MVT::v4i32:
44109 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
44110 case MVT::v32i8:
44111 case MVT::v16i16:
44112 case MVT::v8i32:
44113 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
44114 }
44115
44116 // There must be a shift right algebraic before the xor, and the xor must be a
44117 // 'not' operation.
44118 SDValue Shift = N->getOperand(0);
44119 SDValue Ones = N->getOperand(1);
44120 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
44121 !ISD::isBuildVectorAllOnes(Ones.getNode()))
44122 return SDValue();
44123
44124 // The shift should be smearing the sign bit across each vector element.
44125 auto *ShiftAmt =
44126 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
44127 if (!ShiftAmt ||
44128 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
44129 return SDValue();
44130
44131 // Create a greater-than comparison against -1. We don't use the more obvious
44132 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
44133 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
44134}
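// A minimal illustrative sketch (v4i32 case): sra(X, 31) smears each sign bit
// into 0 or -1, and the xor with all-ones inverts that, giving -1 exactly where
// X >= 0; this is what pcmpgtd X, <-1,-1,-1,-1> produces directly.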
44135
44136/// Detect patterns of truncation with unsigned saturation:
44137///
44138/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
44139/// Return the source value x to be truncated or SDValue() if the pattern was
44140/// not matched.
44141///
44142/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
44143/// where C1 >= 0 and C2 is unsigned max of destination type.
44144///
44145/// (truncate (smax (smin (x, C2), C1)) to dest_type)
44146/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
44147///
44148/// These two patterns are equivalent to:
44149/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
44150/// So return the smax(x, C1) value to be truncated or SDValue() if the
44151/// pattern was not matched.
44152static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
44153 const SDLoc &DL) {
44154 EVT InVT = In.getValueType();
44155
44156 // Saturation with truncation. We truncate from InVT to VT.
44157  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
44158         "Unexpected types for truncate operation");
44159
44160 // Match min/max and return limit value as a parameter.
44161 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
44162 if (V.getOpcode() == Opcode &&
44163 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
44164 return V.getOperand(0);
44165 return SDValue();
44166 };
44167
44168 APInt C1, C2;
44169 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
44170    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
44171 // the element size of the destination type.
44172 if (C2.isMask(VT.getScalarSizeInBits()))
44173 return UMin;
44174
44175 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
44176 if (MatchMinMax(SMin, ISD::SMAX, C1))
44177 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
44178 return SMin;
44179
44180 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
44181 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
44182 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
44183 C2.uge(C1)) {
44184 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
44185 }
44186
44187 return SDValue();
44188}
44189
44190/// Detect patterns of truncation with signed saturation:
44191/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
44192/// signed_max_of_dest_type)) to dest_type)
44193/// or:
44194/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
44195/// signed_min_of_dest_type)) to dest_type).
44196/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
44197/// Return the source value to be truncated or SDValue() if the pattern was not
44198/// matched.
44199static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
44200 unsigned NumDstBits = VT.getScalarSizeInBits();
44201 unsigned NumSrcBits = In.getScalarValueSizeInBits();
44202  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
44203
44204 auto MatchMinMax = [](SDValue V, unsigned Opcode,
44205 const APInt &Limit) -> SDValue {
44206 APInt C;
44207 if (V.getOpcode() == Opcode &&
44208 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
44209 return V.getOperand(0);
44210 return SDValue();
44211 };
44212
44213 APInt SignedMax, SignedMin;
44214 if (MatchPackUS) {
44215 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
44216 SignedMin = APInt(NumSrcBits, 0);
44217 } else {
44218 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
44219 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
44220 }
44221
44222 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
44223 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
44224 return SMax;
44225
44226 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
44227 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
44228 return SMin;
44229
44230 return SDValue();
44231}
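// A minimal illustrative sketch (i32 -> i8 case): the signed-saturation pattern
// matched here is
//   truncate (smin (smax (x, -128), 127))
// (or with the smax/smin order swapped), and the returned value is x so the
// caller can emit a saturating PACKSS-style truncate.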
44232
44233static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
44234 SelectionDAG &DAG,
44235 const X86Subtarget &Subtarget) {
44236 if (!Subtarget.hasSSE2() || !VT.isVector())
44237 return SDValue();
44238
44239 EVT SVT = VT.getVectorElementType();
44240 EVT InVT = In.getValueType();
44241 EVT InSVT = InVT.getVectorElementType();
44242
44243 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
44244  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
44245 // and concatenate at the same time. Then we can use a final vpmovuswb to
44246 // clip to 0-255.
44247 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
44248 InVT == MVT::v16i32 && VT == MVT::v16i8) {
44249 if (auto USatVal = detectSSatPattern(In, VT, true)) {
44250 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
44251 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
44252 DL, DAG, Subtarget);
44253      assert(Mid && "Failed to pack!");
44254 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
44255 }
44256 }
44257
44258 // vXi32 truncate instructions are available with AVX512F.
44259 // vXi16 truncate instructions are only available with AVX512BW.
44260 // For 256-bit or smaller vectors, we require VLX.
44261 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
44262  // If the result type is 256 bits or larger and we have disabled 512-bit
44263 // registers, we should go ahead and use the pack instructions if possible.
44264 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
44265 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
44266 (InVT.getSizeInBits() > 128) &&
44267 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
44268 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
44269
44270 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
44271 VT.getSizeInBits() >= 64 &&
44272 (SVT == MVT::i8 || SVT == MVT::i16) &&
44273 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
44274 if (auto USatVal = detectSSatPattern(In, VT, true)) {
44275 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
44276        // Only do this when the result is at least 64 bits or we'll be leaving
44277 // dangling PACKSSDW nodes.
44278 if (SVT == MVT::i8 && InSVT == MVT::i32) {
44279 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
44280 VT.getVectorNumElements());
44281 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
44282 DAG, Subtarget);
44283        assert(Mid && "Failed to pack!");
44284 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
44285 Subtarget);
44286        assert(V && "Failed to pack!");
44287 return V;
44288 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
44289 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
44290 Subtarget);
44291 }
44292 if (auto SSatVal = detectSSatPattern(In, VT))
44293 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
44294 Subtarget);
44295 }
44296
44297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44298 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
44299 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
44300 unsigned TruncOpc = 0;
44301 SDValue SatVal;
44302 if (auto SSatVal = detectSSatPattern(In, VT)) {
44303 SatVal = SSatVal;
44304 TruncOpc = X86ISD::VTRUNCS;
44305 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
44306 SatVal = USatVal;
44307 TruncOpc = X86ISD::VTRUNCUS;
44308 }
44309 if (SatVal) {
44310 unsigned ResElts = VT.getVectorNumElements();
44311 // If the input type is less than 512 bits and we don't have VLX, we need
44312 // to widen to 512 bits.
44313 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
44314 unsigned NumConcats = 512 / InVT.getSizeInBits();
44315 ResElts *= NumConcats;
44316 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
44317 ConcatOps[0] = SatVal;
44318 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
44319 NumConcats * InVT.getVectorNumElements());
44320 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
44321 }
44322      // Widen the result if it's narrower than 128 bits.
44323 if (ResElts * SVT.getSizeInBits() < 128)
44324 ResElts = 128 / SVT.getSizeInBits();
44325 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
44326 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
44327 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
44328 DAG.getIntPtrConstant(0, DL));
44329 }
44330 }
44331
44332 return SDValue();
44333}
44334
44335/// This function detects the AVG pattern between vectors of unsigned i8/i16,
44336/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
44337/// X86ISD::AVG instruction.
44338static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
44339 const X86Subtarget &Subtarget,
44340 const SDLoc &DL) {
44341 if (!VT.isVector())
44342 return SDValue();
44343 EVT InVT = In.getValueType();
44344 unsigned NumElems = VT.getVectorNumElements();
44345
44346 EVT ScalarVT = VT.getVectorElementType();
44347 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
44348 return SDValue();
44349
44350  // InScalarVT is the intermediate type in the AVG pattern and it should be wider
44351 // than the original input type (i8/i16).
44352 EVT InScalarVT = InVT.getVectorElementType();
44353 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
44354 return SDValue();
44355
44356 if (!Subtarget.hasSSE2())
44357 return SDValue();
44358
44359 // Detect the following pattern:
44360 //
44361 // %1 = zext <N x i8> %a to <N x i32>
44362 // %2 = zext <N x i8> %b to <N x i32>
44363 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
44364 // %4 = add nuw nsw <N x i32> %3, %2
44365  // %5 = lshr <N x i32> %4, <i32 1 x N>
44366 // %6 = trunc <N x i32> %5 to <N x i8>
44367 //
44368 // In AVX512, the last instruction can also be a trunc store.
44369 if (In.getOpcode() != ISD::SRL)
44370 return SDValue();
44371
44372 // A lambda checking the given SDValue is a constant vector and each element
44373 // is in the range [Min, Max].
44374 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
44375 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
44376 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
44377 });
44378 };
44379
44380 // Check if each element of the vector is right-shifted by one.
44381 auto LHS = In.getOperand(0);
44382 auto RHS = In.getOperand(1);
44383 if (!IsConstVectorInRange(RHS, 1, 1))
44384 return SDValue();
44385 if (LHS.getOpcode() != ISD::ADD)
44386 return SDValue();
44387
44388 // Detect a pattern of a + b + 1 where the order doesn't matter.
44389 SDValue Operands[3];
44390 Operands[0] = LHS.getOperand(0);
44391 Operands[1] = LHS.getOperand(1);
44392
44393 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44394 ArrayRef<SDValue> Ops) {
44395 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
44396 };
44397
44398 auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
44399 // Pad to a power-of-2 vector, split+apply and extract the original vector.
44400 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
44401 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
44402 if (NumElemsPow2 != NumElems) {
44403 SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
44404 SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
44405 for (unsigned i = 0; i != NumElems; ++i) {
44406 SDValue Idx = DAG.getIntPtrConstant(i, DL);
44407 Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
44408 Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
44409 }
44410 Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
44411 Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
44412 }
44413 SDValue Res =
44414 SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
44415 if (NumElemsPow2 == NumElems)
44416 return Res;
44417 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
44418 DAG.getIntPtrConstant(0, DL));
44419 };
44420
44421 // Take care of the case when one of the operands is a constant vector whose
44422 // element is in the range [1, 256].
44423 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
44424 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
44425 Operands[0].getOperand(0).getValueType() == VT) {
44426 // The pattern is detected. Subtract one from the constant vector, then
44427 // demote it and emit X86ISD::AVG instruction.
44428 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
44429 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
44430 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
44431 return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
44432 }
44433
44434  // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
44435  // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
44436 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
44437 if (ISD::ADD == V.getOpcode()) {
44438 Op0 = V.getOperand(0);
44439 Op1 = V.getOperand(1);
44440 return true;
44441 }
44442 if (ISD::ZERO_EXTEND != V.getOpcode())
44443 return false;
44444 V = V.getOperand(0);
44445 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
44446 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
44447 return false;
44448 Op0 = V.getOperand(0);
44449 Op1 = V.getOperand(1);
44450 return true;
44451 };
44452
44453 SDValue Op0, Op1;
44454 if (FindAddLike(Operands[0], Op0, Op1))
44455 std::swap(Operands[0], Operands[1]);
44456 else if (!FindAddLike(Operands[1], Op0, Op1))
44457 return SDValue();
44458 Operands[2] = Op0;
44459 Operands[1] = Op1;
44460
44461 // Now we have three operands of two additions. Check that one of them is a
44462 // constant vector with ones, and the other two can be promoted from i8/i16.
44463 for (int i = 0; i < 3; ++i) {
44464 if (!IsConstVectorInRange(Operands[i], 1, 1))
44465 continue;
44466 std::swap(Operands[i], Operands[2]);
44467
44468 // Check if Operands[0] and Operands[1] are results of type promotion.
44469 for (int j = 0; j < 2; ++j)
44470 if (Operands[j].getValueType() != VT) {
44471 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
44472 Operands[j].getOperand(0).getValueType() != VT)
44473 return SDValue();
44474 Operands[j] = Operands[j].getOperand(0);
44475 }
44476
44477 // The pattern is detected, emit X86ISD::AVG instruction(s).
44478 return AVGSplitter(Operands[0], Operands[1]);
44479 }
44480
44481 return SDValue();
44482}
44483
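// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// A minimal scalar model of the rounded average that X86ISD::AVG (PAVGB/PAVGW)
// produces for the pattern matched by detectAVGPattern above, i.e.
// trunc((zext(a) + zext(b) + 1) >> 1). The helper name is hypothetical.
#include <cstdint>

static inline uint8_t scalarRoundedAvgU8(uint8_t A, uint8_t B) {
  // Widen to 32 bits so the +1 rounding bias cannot overflow, then shift back.
  return static_cast<uint8_t>((uint32_t(A) + uint32_t(B) + 1) >> 1);
}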
44484static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
44485 TargetLowering::DAGCombinerInfo &DCI,
44486 const X86Subtarget &Subtarget) {
44487 LoadSDNode *Ld = cast<LoadSDNode>(N);
44488 EVT RegVT = Ld->getValueType(0);
44489 EVT MemVT = Ld->getMemoryVT();
44490 SDLoc dl(Ld);
44491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44492
44493 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
44494 // into two 16-byte operations. Also split non-temporal aligned loads on
44495 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
44496 ISD::LoadExtType Ext = Ld->getExtensionType();
44497 bool Fast;
44498 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
44499 Ext == ISD::NON_EXTLOAD &&
44500 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
44501 Ld->getAlignment() >= 16) ||
44502 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
44503 *Ld->getMemOperand(), &Fast) &&
44504 !Fast))) {
44505 unsigned NumElems = RegVT.getVectorNumElements();
44506 if (NumElems < 2)
44507 return SDValue();
44508
44509 unsigned HalfOffset = 16;
44510 SDValue Ptr1 = Ld->getBasePtr();
44511 SDValue Ptr2 =
44512 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
44513 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
44514 NumElems / 2);
44515 SDValue Load1 =
44516 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
44517 Ld->getOriginalAlign(),
44518 Ld->getMemOperand()->getFlags());
44519 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
44520 Ld->getPointerInfo().getWithOffset(HalfOffset),
44521 Ld->getOriginalAlign(),
44522 Ld->getMemOperand()->getFlags());
44523 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
44524 Load1.getValue(1), Load2.getValue(1));
44525
44526 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
44527 return DCI.CombineTo(N, NewVec, TF, true);
44528 }
44529
44530 // Bool vector load - attempt to cast to an integer, as we have good
44531 // (vXiY *ext(vXi1 bitcast(iX))) handling.
44532 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
44533 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
44534 unsigned NumElts = RegVT.getVectorNumElements();
44535 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44536 if (TLI.isTypeLegal(IntVT)) {
44537 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
44538 Ld->getPointerInfo(),
44539 Ld->getOriginalAlign(),
44540 Ld->getMemOperand()->getFlags());
44541 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
44542 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
44543 }
44544 }
44545
44546 // Cast ptr32 and ptr64 pointers to the default address space before a load.
44547 unsigned AddrSpace = Ld->getAddressSpace();
44548 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
44549 AddrSpace == X86AS::PTR32_UPTR) {
44550 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
44551 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
44552 SDValue Cast =
44553 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
44554 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
44555 Ld->getOriginalAlign(),
44556 Ld->getMemOperand()->getFlags());
44557 }
44558 }
44559
44560 return SDValue();
44561}
44562
44563/// If V is a build vector of boolean constants and exactly one of those
44564/// constants is true, return the operand index of that true element.
44565/// Otherwise, return -1.
44566static int getOneTrueElt(SDValue V) {
44567 // This needs to be a build vector of booleans.
44568 // TODO: Checking for the i1 type matches the IR definition for the mask,
44569 // but the mask check could be loosened to i8 or other types. That might
44570  // also require checking more than 'allOnesValue'; e.g., the x86 HW
44571 // instructions only require that the MSB is set for each mask element.
44572 // The ISD::MSTORE comments/definition do not specify how the mask operand
44573 // is formatted.
44574 auto *BV = dyn_cast<BuildVectorSDNode>(V);
44575 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
44576 return -1;
44577
44578 int TrueIndex = -1;
44579 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
44580 for (unsigned i = 0; i < NumElts; ++i) {
44581 const SDValue &Op = BV->getOperand(i);
44582 if (Op.isUndef())
44583 continue;
44584 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
44585 if (!ConstNode)
44586 return -1;
44587 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
44588 // If we already found a one, this is too many.
44589 if (TrueIndex >= 0)
44590 return -1;
44591 TrueIndex = i;
44592 }
44593 }
44594 return TrueIndex;
44595}
44596
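// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// The same "exactly one true element" scan as getOneTrueElt, expressed over a
// plain bool array (undef handling omitted). Returns -1 for zero or multiple
// set elements. The helper name is hypothetical.
#include <cstddef>

static int findSingleTrueElt(const bool *Mask, size_t NumElts) {
  int TrueIndex = -1;
  for (size_t i = 0; i != NumElts; ++i) {
    if (!Mask[i])
      continue;
    if (TrueIndex >= 0)
      return -1; // More than one element is set.
    TrueIndex = static_cast<int>(i);
  }
  return TrueIndex;
}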
44597/// Given a masked memory load/store operation, return true if exactly one mask
44598/// bit is set. In that case, also return the memory address of the scalar
44599/// element to load/store, the vector index to insert/extract that scalar
44600/// element, and the alignment for the scalar memory access.
44601static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
44602 SelectionDAG &DAG, SDValue &Addr,
44603 SDValue &Index, Align &Alignment,
44604 unsigned &Offset) {
44605 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
44606 if (TrueMaskElt < 0)
44607 return false;
44608
44609 // Get the address of the one scalar element that is specified by the mask
44610 // using the appropriate offset from the base pointer.
44611 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
44612 Offset = 0;
44613 Addr = MaskedOp->getBasePtr();
44614 if (TrueMaskElt != 0) {
44615 Offset = TrueMaskElt * EltVT.getStoreSize();
44616 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
44617 SDLoc(MaskedOp));
44618 }
44619
44620 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
44621 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
44622 EltVT.getStoreSize());
44623 return true;
44624}
44625
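// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Pointer-arithmetic model of the address computed above: the single true lane
// lives at BasePtr + TrueMaskElt * (element store size). Names are hypothetical.
#include <cstdint>

static inline const uint8_t *scalarEltAddr(const uint8_t *BasePtr,
                                           unsigned TrueMaskElt,
                                           unsigned EltStoreSize) {
  return BasePtr + uint64_t(TrueMaskElt) * EltStoreSize;
}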
44626/// If exactly one element of the mask is set for a non-extending masked load,
44627/// reduce it to a scalar load and a vector insert.
44628/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
44629/// mask have already been optimized in IR, so we don't bother with those here.
44630static SDValue
44631reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
44632 TargetLowering::DAGCombinerInfo &DCI,
44633 const X86Subtarget &Subtarget) {
44634  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
44635 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
44636 // However, some target hooks may need to be added to know when the transform
44637 // is profitable. Endianness would also have to be considered.
44638
44639 SDValue Addr, VecIndex;
44640 Align Alignment;
44641 unsigned Offset;
44642 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
44643 return SDValue();
44644
44645 // Load the one scalar element that is specified by the mask using the
44646 // appropriate offset from the base pointer.
44647 SDLoc DL(ML);
44648 EVT VT = ML->getValueType(0);
44649 EVT EltVT = VT.getVectorElementType();
44650
44651 EVT CastVT = VT;
44652 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
44653 EltVT = MVT::f64;
44654 CastVT =
44655 EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
44656 }
44657
44658 SDValue Load =
44659 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
44660 ML->getPointerInfo().getWithOffset(Offset),
44661 Alignment, ML->getMemOperand()->getFlags());
44662
44663 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
44664
44665 // Insert the loaded element into the appropriate place in the vector.
44666 SDValue Insert =
44667 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
44668 Insert = DAG.getBitcast(VT, Insert);
44669 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
44670}
44671
44672static SDValue
44673combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
44674 TargetLowering::DAGCombinerInfo &DCI) {
44675  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
44676 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
44677 return SDValue();
44678
44679 SDLoc DL(ML);
44680 EVT VT = ML->getValueType(0);
44681
44682 // If we are loading the first and last elements of a vector, it is safe and
44683 // always faster to load the whole vector. Replace the masked load with a
44684 // vector load and select.
44685 unsigned NumElts = VT.getVectorNumElements();
44686 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
44687 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
44688 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
44689 if (LoadFirstElt && LoadLastElt) {
44690 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
44691 ML->getMemOperand());
44692 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
44693 ML->getPassThru());
44694 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
44695 }
44696
44697 // Convert a masked load with a constant mask into a masked load and a select.
44698 // This allows the select operation to use a faster kind of select instruction
44699 // (for example, vblendvps -> vblendps).
44700
44701 // Don't try this if the pass-through operand is already undefined. That would
44702 // cause an infinite loop because that's what we're about to create.
44703 if (ML->getPassThru().isUndef())
44704 return SDValue();
44705
44706 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
44707 return SDValue();
44708
44709 // The new masked load has an undef pass-through operand. The select uses the
44710 // original pass-through operand.
44711 SDValue NewML = DAG.getMaskedLoad(
44712 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
44713 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
44714 ML->getAddressingMode(), ML->getExtensionType());
44715 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
44716 ML->getPassThru());
44717
44718 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
44719}
44720
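// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Element-wise model of the load + select rewrites above for a constant mask:
// each lane takes the freshly loaded value when its mask bit is set and the
// pass-through value otherwise. (The full-vector load is only emitted above
// when the first and last lanes are requested, so the whole range is accessed
// anyway.) Names are hypothetical.
#include <cstddef>

static void maskedLoadAsLoadAndSelect(const int *Mem, const bool *Mask,
                                      const int *PassThru, int *Out,
                                      size_t NumElts) {
  for (size_t i = 0; i != NumElts; ++i)
    Out[i] = Mask[i] ? Mem[i] : PassThru[i];
}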
44721static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
44722 TargetLowering::DAGCombinerInfo &DCI,
44723 const X86Subtarget &Subtarget) {
44724 auto *Mld = cast<MaskedLoadSDNode>(N);
44725
44726 // TODO: Expanding load with constant mask may be optimized as well.
44727 if (Mld->isExpandingLoad())
44728 return SDValue();
44729
44730 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
44731 if (SDValue ScalarLoad =
44732 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
44733 return ScalarLoad;
44734
44735 // TODO: Do some AVX512 subsets benefit from this transform?
44736 if (!Subtarget.hasAVX512())
44737 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
44738 return Blend;
44739 }
44740
44741 // If the mask value has been legalized to a non-boolean vector, try to
44742 // simplify ops leading up to it. We only demand the MSB of each lane.
44743 SDValue Mask = Mld->getMask();
44744 if (Mask.getScalarValueSizeInBits() != 1) {
44745 EVT VT = Mld->getValueType(0);
44746 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44747 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
44748 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
44749 if (N->getOpcode() != ISD::DELETED_NODE)
44750 DCI.AddToWorklist(N);
44751 return SDValue(N, 0);
44752 }
44753 if (SDValue NewMask =
44754 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
44755 return DAG.getMaskedLoad(
44756 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
44757 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
44758 Mld->getAddressingMode(), Mld->getExtensionType());
44759 }
44760
44761 return SDValue();
44762}
44763
44764/// If exactly one element of the mask is set for a non-truncating masked store,
44765/// reduce it to a vector extract and a scalar store.
44766/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
44767/// mask have already been optimized in IR, so we don't bother with those here.
44768static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
44769 SelectionDAG &DAG,
44770 const X86Subtarget &Subtarget) {
44771 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
44772 // However, some target hooks may need to be added to know when the transform
44773 // is profitable. Endianness would also have to be considered.
44774
44775 SDValue Addr, VecIndex;
44776 Align Alignment;
44777 unsigned Offset;
44778 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
44779 return SDValue();
44780
44781 // Extract the one scalar element that is actually being stored.
44782 SDLoc DL(MS);
44783 SDValue Value = MS->getValue();
44784 EVT VT = Value.getValueType();
44785 EVT EltVT = VT.getVectorElementType();
44786 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
44787 EltVT = MVT::f64;
44788 EVT CastVT =
44789 EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
44790 Value = DAG.getBitcast(CastVT, Value);
44791 }
44792 SDValue Extract =
44793 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
44794
44795 // Store that element at the appropriate offset from the base pointer.
44796 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
44797 MS->getPointerInfo().getWithOffset(Offset),
44798 Alignment, MS->getMemOperand()->getFlags());
44799}
44800
44801static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
44802 TargetLowering::DAGCombinerInfo &DCI,
44803 const X86Subtarget &Subtarget) {
44804 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
44805 if (Mst->isCompressingStore())
44806 return SDValue();
44807
44808 EVT VT = Mst->getValue().getValueType();
44809 SDLoc dl(Mst);
44810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44811
44812 if (Mst->isTruncatingStore())
44813 return SDValue();
44814
44815 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
44816 return ScalarStore;
44817
44818 // If the mask value has been legalized to a non-boolean vector, try to
44819 // simplify ops leading up to it. We only demand the MSB of each lane.
44820 SDValue Mask = Mst->getMask();
44821 if (Mask.getScalarValueSizeInBits() != 1) {
44822 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
44823 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
44824 if (N->getOpcode() != ISD::DELETED_NODE)
44825 DCI.AddToWorklist(N);
44826 return SDValue(N, 0);
44827 }
44828 if (SDValue NewMask =
44829 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
44830 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
44831 Mst->getBasePtr(), Mst->getOffset(), NewMask,
44832 Mst->getMemoryVT(), Mst->getMemOperand(),
44833 Mst->getAddressingMode());
44834 }
44835
44836 SDValue Value = Mst->getValue();
44837 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
44838 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
44839 Mst->getMemoryVT())) {
44840 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
44841 Mst->getBasePtr(), Mst->getOffset(), Mask,
44842 Mst->getMemoryVT(), Mst->getMemOperand(),
44843 Mst->getAddressingMode(), true);
44844 }
44845
44846 return SDValue();
44847}
44848
44849static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
44850 TargetLowering::DAGCombinerInfo &DCI,
44851 const X86Subtarget &Subtarget) {
44852 StoreSDNode *St = cast<StoreSDNode>(N);
44853 EVT StVT = St->getMemoryVT();
44854 SDLoc dl(St);
44855 SDValue StoredVal = St->getValue();
44856 EVT VT = StoredVal.getValueType();
44857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44858
44859 // Convert a store of vXi1 into a store of iX and a bitcast.
44860 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
44861 VT.getVectorElementType() == MVT::i1) {
44862
44863 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
44864 StoredVal = DAG.getBitcast(NewVT, StoredVal);
44865
44866 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
44867 St->getPointerInfo(), St->getOriginalAlign(),
44868 St->getMemOperand()->getFlags());
44869 }
44870
44871 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
44872 // This will avoid a copy to k-register.
44873 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
44874 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
44875 StoredVal.getOperand(0).getValueType() == MVT::i8) {
44876 SDValue Val = StoredVal.getOperand(0);
44877 // We must store zeros to the unused bits.
44878 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
44879 return DAG.getStore(St->getChain(), dl, Val,
44880 St->getBasePtr(), St->getPointerInfo(),
44881 St->getOriginalAlign(),
44882 St->getMemOperand()->getFlags());
44883 }
44884
44885 // Widen v2i1/v4i1 stores to v8i1.
44886 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
44887 Subtarget.hasAVX512()) {
44888 unsigned NumConcats = 8 / VT.getVectorNumElements();
44889 // We must store zeros to the unused bits.
44890 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
44891 Ops[0] = StoredVal;
44892 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44893 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
44894 St->getPointerInfo(), St->getOriginalAlign(),
44895 St->getMemOperand()->getFlags());
44896 }
44897
44898 // Turn vXi1 stores of constants into a scalar store.
44899 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
44900 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
44901 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
44902    // If it's a v64i1 store without 64-bit support, we need two stores.
44903 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
44904 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
44905 StoredVal->ops().slice(0, 32));
44906 Lo = combinevXi1ConstantToInteger(Lo, DAG);
44907 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
44908 StoredVal->ops().slice(32, 32));
44909 Hi = combinevXi1ConstantToInteger(Hi, DAG);
44910
44911 SDValue Ptr0 = St->getBasePtr();
44912 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
44913
44914 SDValue Ch0 =
44915 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
44916 St->getOriginalAlign(),
44917 St->getMemOperand()->getFlags());
44918 SDValue Ch1 =
44919 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
44920 St->getPointerInfo().getWithOffset(4),
44921 St->getOriginalAlign(),
44922 St->getMemOperand()->getFlags());
44923 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
44924 }
44925
44926 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
44927 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
44928 St->getPointerInfo(), St->getOriginalAlign(),
44929 St->getMemOperand()->getFlags());
44930 }
44931
44932 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
44933 // Sandy Bridge, perform two 16-byte stores.
44934 bool Fast;
44935 if (VT.is256BitVector() && StVT == VT &&
44936 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
44937 *St->getMemOperand(), &Fast) &&
44938 !Fast) {
44939 unsigned NumElems = VT.getVectorNumElements();
44940 if (NumElems < 2)
44941 return SDValue();
44942
44943 return splitVectorStore(St, DAG);
44944 }
44945
44946 // Split under-aligned vector non-temporal stores.
44947 if (St->isNonTemporal() && StVT == VT &&
44948 St->getAlignment() < VT.getStoreSize()) {
44949 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
44950 // vectors or the legalizer can scalarize it to use MOVNTI.
44951 if (VT.is256BitVector() || VT.is512BitVector()) {
44952 unsigned NumElems = VT.getVectorNumElements();
44953 if (NumElems < 2)
44954 return SDValue();
44955 return splitVectorStore(St, DAG);
44956 }
44957
44958 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
44959 // to use MOVNTI.
44960 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
44961 MVT NTVT = Subtarget.hasSSE4A()
44962 ? MVT::v2f64
44963 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
44964 return scalarizeVectorStore(St, NTVT, DAG);
44965 }
44966 }
44967
44968  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
44969  // supported but AVX512F is, by extending to v16i32 and truncating.
44970 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
44971 St->getValue().getOpcode() == ISD::TRUNCATE &&
44972 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
44973 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
44974 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
44975 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
44976 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
44977 MVT::v16i8, St->getMemOperand());
44978 }
44979
44980 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
44981 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
44982 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
44983 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
44984 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
44985 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
44986 return EmitTruncSStore(IsSigned, St->getChain(),
44987 dl, StoredVal.getOperand(0), St->getBasePtr(),
44988 VT, St->getMemOperand(), DAG);
44989 }
44990
44991  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
44992 if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
44993 auto IsExtractedElement = [](SDValue V) {
44994 if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
44995 V = V.getOperand(0);
44996 unsigned Opc = V.getOpcode();
44997 if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
44998 if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
44999 return V.getOperand(0);
45000 }
45001 return SDValue();
45002 };
45003 if (SDValue Extract = IsExtractedElement(StoredVal)) {
45004 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
45005 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
45006 SDValue Src = Trunc.getOperand(0);
45007 MVT DstVT = Trunc.getSimpleValueType();
45008 MVT SrcVT = Src.getSimpleValueType();
45009 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45010 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
45011 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
45012 if (NumTruncBits == VT.getSizeInBits() &&
45013 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
45014 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
45015 TruncVT, St->getMemOperand());
45016 }
45017 }
45018 }
45019 }
45020
45021 // Optimize trunc store (of multiple scalars) to shuffle and store.
45022 // First, pack all of the elements in one place. Next, store to memory
45023 // in fewer chunks.
45024 if (St->isTruncatingStore() && VT.isVector()) {
45025 // Check if we can detect an AVG pattern from the truncation. If yes,
45026 // replace the trunc store by a normal store with the result of X86ISD::AVG
45027 // instruction.
45028 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
45029 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
45030 Subtarget, dl))
45031 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
45032 St->getPointerInfo(), St->getOriginalAlign(),
45033 St->getMemOperand()->getFlags());
45034
45035 if (TLI.isTruncStoreLegal(VT, StVT)) {
45036 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
45037 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
45038 dl, Val, St->getBasePtr(),
45039 St->getMemoryVT(), St->getMemOperand(), DAG);
45040 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
45041 DAG, dl))
45042 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
45043 dl, Val, St->getBasePtr(),
45044 St->getMemoryVT(), St->getMemOperand(), DAG);
45045 }
45046
45047 return SDValue();
45048 }
45049
45050 // Cast ptr32 and ptr64 pointers to the default address space before a store.
45051 unsigned AddrSpace = St->getAddressSpace();
45052 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45053 AddrSpace == X86AS::PTR32_UPTR) {
45054 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45055 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
45056 SDValue Cast =
45057 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
45058 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
45059 St->getPointerInfo(), St->getOriginalAlign(),
45060 St->getMemOperand()->getFlags(), St->getAAInfo());
45061 }
45062 }
45063
45064 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
45065 // the FP state in cases where an emms may be missing.
45066 // A preferable solution to the general problem is to figure out the right
45067 // places to insert EMMS. This qualifies as a quick hack.
45068
45069 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
45070 if (VT.getSizeInBits() != 64)
45071 return SDValue();
45072
45073 const Function &F = DAG.getMachineFunction().getFunction();
45074 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
45075 bool F64IsLegal =
45076 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
45077 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
45078 isa<LoadSDNode>(St->getValue()) &&
45079 cast<LoadSDNode>(St->getValue())->isSimple() &&
45080 St->getChain().hasOneUse() && St->isSimple()) {
45081 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
45082
45083 if (!ISD::isNormalLoad(Ld))
45084 return SDValue();
45085
45086 // Avoid the transformation if there are multiple uses of the loaded value.
45087 if (!Ld->hasNUsesOfValue(1, 0))
45088 return SDValue();
45089
45090 SDLoc LdDL(Ld);
45091 SDLoc StDL(N);
45092 // Lower to a single movq load/store pair.
45093 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
45094 Ld->getBasePtr(), Ld->getMemOperand());
45095
45096 // Make sure new load is placed in same chain order.
45097 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
45098 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
45099 St->getMemOperand());
45100 }
45101
45102 // This is similar to the above case, but here we handle a scalar 64-bit
45103 // integer store that is extracted from a vector on a 32-bit target.
45104 // If we have SSE2, then we can treat it like a floating-point double
45105 // to get past legalization. The execution dependencies fixup pass will
45106 // choose the optimal machine instruction for the store if this really is
45107 // an integer or v2f32 rather than an f64.
45108 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
45109 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
45110 SDValue OldExtract = St->getOperand(1);
45111 SDValue ExtOp0 = OldExtract.getOperand(0);
45112 unsigned VecSize = ExtOp0.getValueSizeInBits();
45113 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
45114 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
45115 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
45116 BitCast, OldExtract.getOperand(1));
45117 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
45118 St->getPointerInfo(), St->getOriginalAlign(),
45119 St->getMemOperand()->getFlags());
45120 }
45121
45122 return SDValue();
45123}
45124
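// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Scalar model of the vXi1 -> iX conversions used by combineStore above,
// assuming lane i of the boolean vector maps to bit i of the scalar (the
// layout used for vXi1 <-> iX bitcasts on x86). The helper name is
// hypothetical and handles at most 32 lanes.
#include <cstdint>

static uint32_t packBoolLanesToScalar(const bool *Lanes, unsigned NumLanes) {
  uint32_t Bits = 0;
  for (unsigned i = 0; i != NumLanes && i != 32; ++i)
    if (Lanes[i])
      Bits |= uint32_t(1) << i;
  return Bits;
}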
45125static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
45126 TargetLowering::DAGCombinerInfo &DCI,
45127 const X86Subtarget &Subtarget) {
45128 auto *St = cast<MemIntrinsicSDNode>(N);
45129
45130 SDValue StoredVal = N->getOperand(1);
45131 MVT VT = StoredVal.getSimpleValueType();
45132 EVT MemVT = St->getMemoryVT();
45133
45134 // Figure out which elements we demand.
45135 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
45136 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
45137
45138 APInt KnownUndef, KnownZero;
45139 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45140 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
45141 KnownZero, DCI)) {
45142 if (N->getOpcode() != ISD::DELETED_NODE)
45143 DCI.AddToWorklist(N);
45144 return SDValue(N, 0);
45145 }
45146
45147 return SDValue();
45148}
45149
45150/// Return 'true' if this vector operation is "horizontal"
45151/// and return the operands for the horizontal operation in LHS and RHS. A
45152/// horizontal operation performs the binary operation on successive elements
45153/// of its first operand, then on successive elements of its second operand,
45154/// returning the resulting values in a vector. For example, if
45155/// A = < float a0, float a1, float a2, float a3 >
45156/// and
45157/// B = < float b0, float b1, float b2, float b3 >
45158/// then the result of doing a horizontal operation on A and B is
45159/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
45160/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
45161/// A horizontal-op B, for some already available A and B, and if so then LHS is
45162/// set to A, RHS to B, and the routine returns 'true'.
45163static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
45164 const X86Subtarget &Subtarget, bool IsCommutative,
45165 SmallVectorImpl<int> &PostShuffleMask) {
45166 // If either operand is undef, bail out. The binop should be simplified.
45167 if (LHS.isUndef() || RHS.isUndef())
45168 return false;
45169
45170 // Look for the following pattern:
45171 // A = < float a0, float a1, float a2, float a3 >
45172 // B = < float b0, float b1, float b2, float b3 >
45173 // and
45174 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
45175 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
45176 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
45177 // which is A horizontal-op B.
45178
45179 MVT VT = LHS.getSimpleValueType();
45180  assert((VT.is128BitVector() || VT.is256BitVector()) &&
45181         "Unsupported vector type for horizontal add/sub");
45182 unsigned NumElts = VT.getVectorNumElements();
45183
45184 // TODO - can we make a general helper method that does all of this for us?
45185 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
45186 SmallVectorImpl<int> &ShuffleMask) {
45187 if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
45188 if (!Op.getOperand(0).isUndef())
45189 N0 = Op.getOperand(0);
45190 if (!Op.getOperand(1).isUndef())
45191 N1 = Op.getOperand(1);
45192 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
45193 ShuffleMask.append(Mask.begin(), Mask.end());
45194 return;
45195 }
45196 bool UseSubVector = false;
45197 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45198 Op.getOperand(0).getValueType().is256BitVector() &&
45199 llvm::isNullConstant(Op.getOperand(1))) {
45200 Op = Op.getOperand(0);
45201 UseSubVector = true;
45202 }
45203 bool IsUnary;
45204 SmallVector<SDValue, 2> SrcOps;
45205 SmallVector<int, 16> SrcShuffleMask;
45206 SDValue BC = peekThroughBitcasts(Op);
45207 if (isTargetShuffle(BC.getOpcode()) &&
45208 getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
45209 SrcOps, SrcShuffleMask, IsUnary)) {
45210 if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
45211 SrcOps.size() <= 2) {
45212 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
45213 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
45214 ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
45215 }
45216 if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
45217 SrcOps.size() == 1) {
45218 N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
45219 N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
45220 ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
45221 ShuffleMask.append(Mask.begin(), Mask.end());
45222 }
45223 }
45224 };
45225
45226 // View LHS in the form
45227 // LHS = VECTOR_SHUFFLE A, B, LMask
45228 // If LHS is not a shuffle, then pretend it is the identity shuffle:
45229 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
45230 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
45231 SDValue A, B;
45232 SmallVector<int, 16> LMask;
45233 GetShuffle(LHS, A, B, LMask);
45234
45235 // Likewise, view RHS in the form
45236 // RHS = VECTOR_SHUFFLE C, D, RMask
45237 SDValue C, D;
45238 SmallVector<int, 16> RMask;
45239 GetShuffle(RHS, C, D, RMask);
45240
45241 // At least one of the operands should be a vector shuffle.
45242 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
45243 if (NumShuffles == 0)
45244 return false;
45245
45246 if (LMask.empty()) {
45247 A = LHS;
45248 for (unsigned i = 0; i != NumElts; ++i)
45249 LMask.push_back(i);
45250 }
45251
45252 if (RMask.empty()) {
45253 C = RHS;
45254 for (unsigned i = 0; i != NumElts; ++i)
45255 RMask.push_back(i);
45256 }
45257
45258 // If A and B occur in reverse order in RHS, then canonicalize by commuting
45259 // RHS operands and shuffle mask.
45260 if (A != C) {
45261 std::swap(C, D);
45262 ShuffleVectorSDNode::commuteMask(RMask);
45263 }
45264 // Check that the shuffles are both shuffling the same vectors.
45265 if (!(A == C && B == D))
45266 return false;
45267
45268 PostShuffleMask.clear();
45269 PostShuffleMask.append(NumElts, SM_SentinelUndef);
45270
45271 // LHS and RHS are now:
45272 // LHS = shuffle A, B, LMask
45273 // RHS = shuffle A, B, RMask
45274 // Check that the masks correspond to performing a horizontal operation.
45275 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
45276 // so we just repeat the inner loop if this is a 256-bit op.
45277 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
45278 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
45279 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
45280  assert((NumEltsPer128BitChunk % 2 == 0) &&
45281         "Vector type should have an even number of elements in each lane");
45282 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
45283 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
45284 // Ignore undefined components.
45285 int LIdx = LMask[i + j], RIdx = RMask[i + j];
45286 if (LIdx < 0 || RIdx < 0 ||
45287 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
45288 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
45289 continue;
45290
45291 // Check that successive odd/even elements are being operated on. If not,
45292 // this is not a horizontal operation.
45293 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
45294 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
45295 return false;
45296
45297 // Compute the post-shuffle mask index based on where the element
45298 // is stored in the HOP result, and where it needs to be moved to.
45299 int Base = LIdx & ~1u;
45300 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
45301 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
45302
45303 // The low half of the 128-bit result must choose from A.
45304 // The high half of the 128-bit result must choose from B,
45305 // unless B is undef. In that case, we are always choosing from A.
45306 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
45307 Index += NumEltsPer64BitChunk;
45308 PostShuffleMask[i + j] = Index;
45309 }
45310 }
45311
45312 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
45313 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
45314
45315 bool IsIdentityPostShuffle =
45316 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
45317 if (IsIdentityPostShuffle)
45318 PostShuffleMask.clear();
45319
45320 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
45321 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
45322 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
45323 return false;
45324
45325 // Assume a SingleSource HOP if we only shuffle one input and don't need to
45326 // shuffle the result.
45327 if (!shouldUseHorizontalOp(NewLHS == NewRHS &&
45328 (NumShuffles < 2 || !IsIdentityPostShuffle),
45329 DAG, Subtarget))
45330 return false;
45331
45332 LHS = DAG.getBitcast(VT, NewLHS);
45333 RHS = DAG.getBitcast(VT, NewRHS);
45334 return true;
45335}
45336
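// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Scalar model of a 128-bit horizontal add (e.g. HADDPS) over two 4-element
// vectors, matching the pairing described in the isHorizontalBinOp comment:
// Out = < a0+a1, a2+a3, b0+b1, b2+b3 >. The helper name is hypothetical.
static void haddps128(const float A[4], const float B[4], float Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}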
45337/// Do target-specific dag combines on floating-point adds/subs.
45338static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
45339 const X86Subtarget &Subtarget) {
45340 EVT VT = N->getValueType(0);
45341 SDValue LHS = N->getOperand(0);
45342 SDValue RHS = N->getOperand(1);
45343 bool IsFadd = N->getOpcode() == ISD::FADD;
45344 auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
45345  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
45346
45347 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
45348 SmallVector<int, 8> PostShuffleMask;
45349 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
45350 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
45351 isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
45352 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
45353 if (!PostShuffleMask.empty())
45354 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
45355 DAG.getUNDEF(VT), PostShuffleMask);
45356 return HorizBinOp;
45357 }
45358
45359 return SDValue();
45360}
45361
45362/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
45363/// the codegen.
45364/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
45365/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
45366/// anything that is guaranteed to be transformed by DAGCombiner.
45367static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
45368 const X86Subtarget &Subtarget,
45369 const SDLoc &DL) {
45370  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
45371 SDValue Src = N->getOperand(0);
45372 unsigned SrcOpcode = Src.getOpcode();
45373 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45374
45375 EVT VT = N->getValueType(0);
45376 EVT SrcVT = Src.getValueType();
45377
45378 auto IsFreeTruncation = [VT](SDValue Op) {
45379 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
45380
45381 // See if this has been extended from a smaller/equal size to
45382 // the truncation size, allowing a truncation to combine with the extend.
45383 unsigned Opcode = Op.getOpcode();
45384 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
45385 Opcode == ISD::ZERO_EXTEND) &&
45386 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
45387 return true;
45388
45389 // See if this is a single use constant which can be constant folded.
45390    // NOTE: We don't peek through bitcasts here because there is currently
45391    // no support for constant folding truncate+bitcast+vector_of_constants. So
45392    // we'd just end up with a truncate on both operands, which would get
45393    // turned back into (truncate (binop)), causing an infinite loop.
45394 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
45395 };
45396
45397 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
45398 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
45399 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
45400 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
45401 };
45402
45403 // Don't combine if the operation has other uses.
45404 if (!Src.hasOneUse())
45405 return SDValue();
45406
45407 // Only support vector truncation for now.
45408 // TODO: i64 scalar math would benefit as well.
45409 if (!VT.isVector())
45410 return SDValue();
45411
45412  // In most cases it's only worth pre-truncating if we're only facing the cost
45413 // of one truncation.
45414 // i.e. if one of the inputs will constant fold or the input is repeated.
45415 switch (SrcOpcode) {
45416 case ISD::MUL:
45417    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
45418 // better to truncate if we have the chance.
45419 if (SrcVT.getScalarType() == MVT::i64 &&
45420 TLI.isOperationLegal(SrcOpcode, VT) &&
45421 !TLI.isOperationLegal(SrcOpcode, SrcVT))
45422 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
45423    LLVM_FALLTHROUGH;
45424 case ISD::AND:
45425 case ISD::XOR:
45426 case ISD::OR:
45427 case ISD::ADD:
45428 case ISD::SUB: {
45429 SDValue Op0 = Src.getOperand(0);
45430 SDValue Op1 = Src.getOperand(1);
45431 if (TLI.isOperationLegal(SrcOpcode, VT) &&
45432 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
45433 return TruncateArithmetic(Op0, Op1);
45434 break;
45435 }
45436 }
45437
45438 return SDValue();
45439}
45440
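// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// The identity combineTruncatedArithmetic relies on for the wrapping ops it
// handles (add/sub/mul/and/or/xor): truncating after the operation gives the
// same bits as operating on pre-truncated inputs, because the discarded high
// bits never influence the low bits. Shown here for a u64 add truncated to u32.
#include <cstdint>

static bool truncOfAddEqualsAddOfTrunc(uint64_t X, uint64_t Y) {
  uint32_t TruncAfter = static_cast<uint32_t>(X + Y);
  uint32_t TruncBefore = static_cast<uint32_t>(X) + static_cast<uint32_t>(Y);
  return TruncAfter == TruncBefore; // Holds for all X and Y.
}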
45441/// Truncate using ISD::AND mask and X86ISD::PACKUS.
45442/// e.g. trunc <8 x i32> X to <8 x i16> -->
45443/// MaskX = X & 0xffff (clear high bits to prevent saturation)
45444/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
45445static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
45446 const X86Subtarget &Subtarget,
45447 SelectionDAG &DAG) {
45448 SDValue In = N->getOperand(0);
45449 EVT InVT = In.getValueType();
45450 EVT OutVT = N->getValueType(0);
45451
45452 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
45453 OutVT.getScalarSizeInBits());
45454 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
45455 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
45456}
45457
45458/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
45459static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
45460 const X86Subtarget &Subtarget,
45461 SelectionDAG &DAG) {
45462 SDValue In = N->getOperand(0);
45463 EVT InVT = In.getValueType();
45464 EVT OutVT = N->getValueType(0);
45465 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
45466 DAG.getValueType(OutVT));
45467 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
45468}
45469
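// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Per-element saturation applied by PACKSS/PACKUS when narrowing lanes (shown
// here for i16 -> i8). The helpers above first clear or sign-fill the high
// bits so this saturation leaves the wanted low bits unchanged. Names are
// hypothetical.
#include <algorithm>
#include <cstdint>

static inline int8_t packssElt(int16_t V) {  // signed saturate to [-128, 127]
  return static_cast<int8_t>(std::min<int>(std::max<int>(V, -128), 127));
}
static inline uint8_t packusElt(int16_t V) { // unsigned saturate to [0, 255]
  return static_cast<uint8_t>(std::min<int>(std::max<int>(V, 0), 255));
}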
45470/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
45471/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
45472/// legalization the truncation will be translated into a BUILD_VECTOR with each
45473/// element extracted from a vector and then truncated, and it is difficult to
45474/// perform this optimization on that form.
45475static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
45476 const X86Subtarget &Subtarget) {
45477 EVT OutVT = N->getValueType(0);
45478 if (!OutVT.isVector())
45479 return SDValue();
45480
45481 SDValue In = N->getOperand(0);
45482 if (!In.getValueType().isSimple())
45483 return SDValue();
45484
45485 EVT InVT = In.getValueType();
45486 unsigned NumElems = OutVT.getVectorNumElements();
45487
45488 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
45489 // SSE2, and we need to take care of it specially.
45490 // AVX512 provides vpmovdb.
45491 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
45492 return SDValue();
45493
45494 EVT OutSVT = OutVT.getVectorElementType();
45495 EVT InSVT = InVT.getVectorElementType();
45496 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
45497 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
45498 NumElems >= 8))
45499 return SDValue();
45500
45501  // SSSE3's pshufb results in fewer instructions in the cases below.
45502 if (Subtarget.hasSSSE3() && NumElems == 8 &&
45503 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
45504 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
45505 return SDValue();
45506
45507 SDLoc DL(N);
45508 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
45509 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
45510 // truncate 2 x v4i32 to v8i16.
45511 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
45512 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
45513 if (InSVT == MVT::i32)
45514 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
45515
45516 return SDValue();
45517}
45518
45519/// This function transforms vector truncation of 'extended sign-bits' or
45520/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
45521/// X86ISD::PACKSS/PACKUS operations.
45522static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
45523 SelectionDAG &DAG,
45524 const X86Subtarget &Subtarget) {
45525 // Requires SSE2.
45526 if (!Subtarget.hasSSE2())
45527 return SDValue();
45528
45529 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
45530 return SDValue();
45531
45532 SDValue In = N->getOperand(0);
45533 if (!In.getValueType().isSimple())
45534 return SDValue();
45535
45536 MVT VT = N->getValueType(0).getSimpleVT();
45537 MVT SVT = VT.getScalarType();
45538
45539 MVT InVT = In.getValueType().getSimpleVT();
45540 MVT InSVT = InVT.getScalarType();
45541
45542 // Check we have a truncation suited for PACKSS/PACKUS.
45543 if (!isPowerOf2_32(VT.getVectorNumElements()))
45544 return SDValue();
45545 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
45546 return SDValue();
45547 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
45548 return SDValue();
45549
45550 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
45551 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
45552 return SDValue();
45553
45554 // AVX512 has fast truncate, but if the input is already going to be split,
45555 // there's no harm in trying pack.
45556 if (Subtarget.hasAVX512() &&
45557 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
45558 InVT.is512BitVector()))
45559 return SDValue();
45560
45561 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
45562 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
45563
45564 // Use PACKUS if the input has zero-bits that extend all the way to the
45565 // packed/truncated value. e.g. masks, zext_in_reg, etc.
45566 KnownBits Known = DAG.computeKnownBits(In);
45567 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
45568 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
45569 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
45570
45571 // Use PACKSS if the input has sign-bits that extend all the way to the
45572 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
45573 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
45574
45575 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
45576 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
45577 // on and combines/simplifications can't then use it.
45578 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
45579 return SDValue();
45580
45581 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
45582 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
45583
45584 return SDValue();
45585}
45586
45587// Try to form a MULHU or MULHS node by looking for
45588// (trunc (srl (mul ext, ext), 16))
45589// TODO: This is X86 specific because we want to be able to handle wide types
45590// before type legalization. But we can only do it if the vector will be
45591// legalized via widening/splitting. Type legalization can't handle promotion
45592// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
45593// combiner.
45594static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
45595 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45596 // First instruction should be a right shift of a multiply.
45597 if (Src.getOpcode() != ISD::SRL ||
45598 Src.getOperand(0).getOpcode() != ISD::MUL)
45599 return SDValue();
45600
45601 if (!Subtarget.hasSSE2())
45602 return SDValue();
45603
45604  // Only handle vXi16 types that are at least 128 bits wide unless they will be
45605 // widened.
45606 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
45607 return SDValue();
45608
45609 // Input type should be at least vXi32.
45610 EVT InVT = Src.getValueType();
45611 if (InVT.getVectorElementType().getSizeInBits() < 32)
45612 return SDValue();
45613
45614 // Need a shift by 16.
45615 APInt ShiftAmt;
45616 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
45617 ShiftAmt != 16)
45618 return SDValue();
45619
45620 SDValue LHS = Src.getOperand(0).getOperand(0);
45621 SDValue RHS = Src.getOperand(0).getOperand(1);
45622
45623 unsigned ExtOpc = LHS.getOpcode();
45624 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
45625 RHS.getOpcode() != ExtOpc)
45626 return SDValue();
45627
45628 // Peek through the extends.
45629 LHS = LHS.getOperand(0);
45630 RHS = RHS.getOperand(0);
45631
45632 // Ensure the input types match.
45633 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
45634 return SDValue();
45635
45636 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
45637 return DAG.getNode(Opc, DL, VT, LHS, RHS);
45638}
45639
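// [Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.]
// Scalar models of what the MULHU/MULHS nodes formed by combinePMULH compute
// for an i16 lane: the upper 16 bits of the widened product, i.e. the same
// value as trunc((ext(a) * ext(b)) >> 16). Helper names are hypothetical; the
// signed variant relies on arithmetic right shift of negative values, as
// provided by mainstream compilers.
#include <cstdint>

static inline uint16_t mulhu16(uint16_t A, uint16_t B) {
  return static_cast<uint16_t>((uint32_t(A) * uint32_t(B)) >> 16);
}
static inline int16_t mulhs16(int16_t A, int16_t B) {
  return static_cast<int16_t>((int32_t(A) * int32_t(B)) >> 16);
}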
45640// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
45641// from one vector with signed bytes from another vector, adds together
45642// adjacent pairs of 16-bit products, and saturates the result before
45643// truncating to 16-bits.
45644//
45645// Which looks something like this:
45646// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
45647// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
45648static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
45649 const X86Subtarget &Subtarget,
45650 const SDLoc &DL) {
45651 if (!VT.isVector() || !Subtarget.hasSSSE3())
45652 return SDValue();
45653
45654 unsigned NumElems = VT.getVectorNumElements();
45655 EVT ScalarVT = VT.getVectorElementType();
45656 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
45657 return SDValue();
45658
45659 SDValue SSatVal = detectSSatPattern(In, VT);
45660 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
45661 return SDValue();
45662
45663 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
45664 // of multiplies from even/odd elements.
45665 SDValue N0 = SSatVal.getOperand(0);
45666 SDValue N1 = SSatVal.getOperand(1);
45667
45668 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
45669 return SDValue();
45670
45671 SDValue N00 = N0.getOperand(0);
45672 SDValue N01 = N0.getOperand(1);
45673 SDValue N10 = N1.getOperand(0);
45674 SDValue N11 = N1.getOperand(1);
45675
45676 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
45677 // Canonicalize zero_extend to LHS.
45678 if (N01.getOpcode() == ISD::ZERO_EXTEND)
45679 std::swap(N00, N01);
45680 if (N11.getOpcode() == ISD::ZERO_EXTEND)
45681 std::swap(N10, N11);
45682
45683 // Ensure we have a zero_extend and a sign_extend.
45684 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
45685 N01.getOpcode() != ISD::SIGN_EXTEND ||
45686 N10.getOpcode() != ISD::ZERO_EXTEND ||
45687 N11.getOpcode() != ISD::SIGN_EXTEND)
45688 return SDValue();
45689
45690 // Peek through the extends.
45691 N00 = N00.getOperand(0);
45692 N01 = N01.getOperand(0);
45693 N10 = N10.getOperand(0);
45694 N11 = N11.getOperand(0);
45695
45696 // Ensure the extend is from vXi8.
45697 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
45698 N01.getValueType().getVectorElementType() != MVT::i8 ||
45699 N10.getValueType().getVectorElementType() != MVT::i8 ||
45700 N11.getValueType().getVectorElementType() != MVT::i8)
45701 return SDValue();
45702
45703 // All inputs should be build_vectors.
45704 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
45705 N01.getOpcode() != ISD::BUILD_VECTOR ||
45706 N10.getOpcode() != ISD::BUILD_VECTOR ||
45707 N11.getOpcode() != ISD::BUILD_VECTOR)
45708 return SDValue();
45709
45710 // N00/N10 are zero extended. N01/N11 are sign extended.
45711
45712 // For each element, we need to ensure we have an odd element from one vector
45713 // multiplied by the odd element of another vector and the even element from
45714 // one of the same vectors being multiplied by the even element from the
45715  // other vector. So we need to make sure that, for each element i, this operation
45716 // is being performed:
45717 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
45718 SDValue ZExtIn, SExtIn;
45719 for (unsigned i = 0; i != NumElems; ++i) {
45720 SDValue N00Elt = N00.getOperand(i);
45721 SDValue N01Elt = N01.getOperand(i);
45722 SDValue N10Elt = N10.getOperand(i);
45723 SDValue N11Elt = N11.getOperand(i);
45724 // TODO: Be more tolerant to undefs.
45725 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45726 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45727 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45728 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
45729 return SDValue();
45730 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
45731 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
45732 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
45733 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
45734 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
45735 return SDValue();
45736 unsigned IdxN00 = ConstN00Elt->getZExtValue();
45737 unsigned IdxN01 = ConstN01Elt->getZExtValue();
45738 unsigned IdxN10 = ConstN10Elt->getZExtValue();
45739 unsigned IdxN11 = ConstN11Elt->getZExtValue();
45740 // Add is commutative so indices can be reordered.
45741 if (IdxN00 > IdxN10) {
45742 std::swap(IdxN00, IdxN10);
45743 std::swap(IdxN01, IdxN11);
45744 }
45745 // N0 indices must be the even element. N1 indices must be the next odd element.
45746 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
45747 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
45748 return SDValue();
45749 SDValue N00In = N00Elt.getOperand(0);
45750 SDValue N01In = N01Elt.getOperand(0);
45751 SDValue N10In = N10Elt.getOperand(0);
45752 SDValue N11In = N11Elt.getOperand(0);
45753 // First time we find an input capture it.
45754 if (!ZExtIn) {
45755 ZExtIn = N00In;
45756 SExtIn = N01In;
45757 }
45758 if (ZExtIn != N00In || SExtIn != N01In ||
45759 ZExtIn != N10In || SExtIn != N11In)
45760 return SDValue();
45761 }
45762
45763 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45764 ArrayRef<SDValue> Ops) {
45765 // Shrink by adding truncate nodes and let DAGCombine fold with the
45766 // sources.
45767 EVT InVT = Ops[0].getValueType();
45768 assert(InVT.getScalarType() == MVT::i8 &&
45769        "Unexpected scalar element type");
45770 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
45771 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
45772 InVT.getVectorNumElements() / 2);
45773 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
45774 };
45775 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
45776 PMADDBuilder);
45777}
45778
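Illustrative sketch (not part of X86ISelLowering.cpp; all names invented): the fold above matches a signed-saturating add of even/odd products where one i8 input is zero-extended and the other sign-extended, which is exactly the scalar behaviour modelled below for each 16-bit output lane.

// Standalone scalar model of the pmaddubsw-style pattern matched above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static int16_t SatS16(int32_t X) {
  return (int16_t)std::min<int32_t>(std::max<int32_t>(X, INT16_MIN), INT16_MAX);
}

// Out[i] = sat16(zext(A[2i]) * sext(B[2i]) + zext(A[2i+1]) * sext(B[2i+1]))
static std::vector<int16_t> PMAddUBSW(const std::vector<uint8_t> &A,
                                      const std::vector<int8_t> &B) {
  std::vector<int16_t> Out(A.size() / 2);
  for (size_t I = 0; I != Out.size(); ++I) {
    int32_t Lo = (int32_t)A[2 * I] * (int32_t)B[2 * I];
    int32_t Hi = (int32_t)A[2 * I + 1] * (int32_t)B[2 * I + 1];
    Out[I] = SatS16(Lo + Hi);
  }
  return Out;
}

int main() {
  std::vector<uint8_t> A = {255, 255, 1, 2};
  std::vector<int8_t> B = {127, 127, -3, 4};
  for (int16_t V : PMAddUBSW(A, B))
    printf("%d\n", V); // prints 32767 (saturated) and 5
  return 0;
}
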
45779static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
45780 const X86Subtarget &Subtarget) {
45781 EVT VT = N->getValueType(0);
45782 SDValue Src = N->getOperand(0);
45783 SDLoc DL(N);
45784
45785 // Attempt to pre-truncate inputs to arithmetic ops instead.
45786 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
45787 return V;
45788
45789 // Try to detect AVG pattern first.
45790 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
45791 return Avg;
45792
45793 // Try to detect PMADD
45794 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
45795 return PMAdd;
45796
45797 // Try to combine truncation with signed/unsigned saturation.
45798 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
45799 return Val;
45800
45801 // Try to combine PMULHUW/PMULHW for vXi16.
45802 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
45803 return V;
45804
45805 // The bitcast source is a direct mmx result.
45806 // Detect bitcasts between i32 and x86mmx.
45807 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
45808 SDValue BCSrc = Src.getOperand(0);
45809 if (BCSrc.getValueType() == MVT::x86mmx)
45810 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
45811 }
45812
45813 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
45814 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
45815 return V;
45816
45817 return combineVectorTruncation(N, DAG, Subtarget);
45818}
45819
45820static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
45821 TargetLowering::DAGCombinerInfo &DCI) {
45822 EVT VT = N->getValueType(0);
45823 SDValue In = N->getOperand(0);
45824 SDLoc DL(N);
45825
45826 if (auto SSatVal = detectSSatPattern(In, VT))
45827 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
45828 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
45829 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
45830
45831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45832 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
45833 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
45834 return SDValue(N, 0);
45835
45836 return SDValue();
45837}
45838
45839/// Returns the negated value if the node \p N flips sign of FP value.
45840///
45841/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
45842/// or FSUB(0, x)
45843/// AVX512F does not have FXOR, so FNEG is lowered as
45844/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
45845 /// In this case we go through all bitcasts.
45846/// This also recognizes splat of a negated value and returns the splat of that
45847/// value.
45848static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
45849 if (N->getOpcode() == ISD::FNEG)
45850 return N->getOperand(0);
45851
45852 // Don't recurse exponentially.
45853 if (Depth > SelectionDAG::MaxRecursionDepth)
45854 return SDValue();
45855
45856 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
45857
45858 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
45859 EVT VT = Op->getValueType(0);
45860
45861 // Make sure the element size doesn't change.
45862 if (VT.getScalarSizeInBits() != ScalarSize)
45863 return SDValue();
45864
45865 unsigned Opc = Op.getOpcode();
45866 switch (Opc) {
45867 case ISD::VECTOR_SHUFFLE: {
45868 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
45869 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
45870 if (!Op.getOperand(1).isUndef())
45871 return SDValue();
45872 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
45873 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
45874 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
45875 cast<ShuffleVectorSDNode>(Op)->getMask());
45876 break;
45877 }
45878 case ISD::INSERT_VECTOR_ELT: {
45879 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
45880 // -V, INDEX).
45881 SDValue InsVector = Op.getOperand(0);
45882 SDValue InsVal = Op.getOperand(1);
45883 if (!InsVector.isUndef())
45884 return SDValue();
45885 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
45886 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
45887 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
45888 NegInsVal, Op.getOperand(2));
45889 break;
45890 }
45891 case ISD::FSUB:
45892 case ISD::XOR:
45893 case X86ISD::FXOR: {
45894 SDValue Op1 = Op.getOperand(1);
45895 SDValue Op0 = Op.getOperand(0);
45896
45897 // For XOR and FXOR, we want to check if constant
45898 // bits of Op1 are sign bit masks. For FSUB, we
45899 // have to check if constant bits of Op0 are sign
45900 // bit masks and hence we swap the operands.
45901 if (Opc == ISD::FSUB)
45902 std::swap(Op0, Op1);
45903
45904 APInt UndefElts;
45905 SmallVector<APInt, 16> EltBits;
45906 // Extract constant bits and see if they are all
45907 // sign bit masks. Ignore the undef elements.
45908 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
45909 /* AllowWholeUndefs */ true,
45910 /* AllowPartialUndefs */ false)) {
45911 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
45912 if (!UndefElts[I] && !EltBits[I].isSignMask())
45913 return SDValue();
45914
45915 return peekThroughBitcasts(Op0);
45916 }
45917 }
45918 }
45919
45920 return SDValue();
45921}
45922
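Illustrative sketch (not part of X86ISelLowering.cpp; names invented): isFNEG treats XOR with a sign-bit mask as a negation because flipping the top bit of an IEEE-754 encoding negates the value without touching exponent or mantissa.

// Standalone demonstration of the sign-bit XOR identity used above.
#include <cassert>
#include <cstdint>
#include <cstring>

static float XorSignBit(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  Bits ^= 0x80000000u; // the sign-bit mask the combine looks for
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  assert(XorSignBit(1.5f) == -1.5f);
  assert(XorSignBit(-0.25f) == 0.25f);
  return 0;
}
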
45923static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
45924 bool NegRes) {
45925 if (NegMul) {
45926 switch (Opcode) {
45927 default: llvm_unreachable("Unexpected opcode");
45928 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
45929 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
45930 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
45931 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
45932 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
45933 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
45934 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
45935 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
45936 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
45937 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
45938 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
45939 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
45940 }
45941 }
45942
45943 if (NegAcc) {
45944 switch (Opcode) {
45945 default: llvm_unreachable("Unexpected opcode");
45946 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
45947 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
45948 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
45949 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
45950 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
45951 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
45952 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
45953 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
45954 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
45955 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
45956 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
45957 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
45958 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
45959 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
45960 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
45961 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
45962 }
45963 }
45964
45965 if (NegRes) {
45966 switch (Opcode) {
45967 // For accuracy reasons, we never combine fneg and fma under strict FP.
45968 default: llvm_unreachable("Unexpected opcode");
45969 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
45970 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
45971 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
45972 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
45973 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
45974 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
45975 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
45976 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
45977 }
45978 }
45979
45980 return Opcode;
45981}
45982
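Illustrative sketch (not part of X86ISelLowering.cpp; helper names invented): the opcode table in negateFMAOpcode encodes simple algebraic identities, checked below for exactly representable values so no rounding is involved.

// Negating a multiplicand, the accumulator, or the result maps one FMA form
// onto another, mirroring the NegMul/NegAcc/NegRes switches above.
#include <cassert>

static double FMAdd(double A, double B, double C) { return A * B + C; }
static double FMSub(double A, double B, double C) { return A * B - C; }
static double FNMAdd(double A, double B, double C) { return -(A * B) + C; }
static double FNMSub(double A, double B, double C) { return -(A * B) - C; }

int main() {
  double A = 2.0, B = 3.0, C = 5.0;
  assert(FMAdd(-A, B, C) == FNMAdd(A, B, C)); // NegMul: FMA    -> FNMADD
  assert(FMAdd(A, B, -C) == FMSub(A, B, C));  // NegAcc: FMA    -> FMSUB
  assert(-FMAdd(A, B, C) == FNMSub(A, B, C)); // NegRes: FMA    -> FNMSUB
  assert(-FMSub(A, B, C) == FNMAdd(A, B, C)); // NegRes: FMSUB  -> FNMADD
  return 0;
}
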
45983/// Do target-specific dag combines on floating point negations.
45984static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
45985 TargetLowering::DAGCombinerInfo &DCI,
45986 const X86Subtarget &Subtarget) {
45987 EVT OrigVT = N->getValueType(0);
45988 SDValue Arg = isFNEG(DAG, N);
45989 if (!Arg)
45990 return SDValue();
45991
45992 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45993 EVT VT = Arg.getValueType();
45994 EVT SVT = VT.getScalarType();
45995 SDLoc DL(N);
45996
45997 // Let legalize expand this if it isn't a legal type yet.
45998 if (!TLI.isTypeLegal(VT))
45999 return SDValue();
46000
46001 // If we're negating a FMUL node on a target with FMA, then we can avoid the
46002 // use of a constant by performing (-0 - A*B) instead.
46003 // FIXME: Check rounding control flags as well once it becomes available.
46004 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
46005 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
46006 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
46007 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
46008 Arg.getOperand(1), Zero);
46009 return DAG.getBitcast(OrigVT, NewNode);
46010 }
46011
46012 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
46013 bool LegalOperations = !DCI.isBeforeLegalizeOps();
46014 if (SDValue NegArg =
46015 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
46016 return DAG.getBitcast(OrigVT, NegArg);
46017
46018 return SDValue();
46019}
46020
46021SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
46022 bool LegalOperations,
46023 bool ForCodeSize,
46024 NegatibleCost &Cost,
46025 unsigned Depth) const {
46026 // fneg patterns are removable even if they have multiple uses.
46027 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
46028 Cost = NegatibleCost::Cheaper;
46029 return DAG.getBitcast(Op.getValueType(), Arg);
46030 }
46031
46032 EVT VT = Op.getValueType();
46033 EVT SVT = VT.getScalarType();
46034 unsigned Opc = Op.getOpcode();
46035 switch (Opc) {
46036 case ISD::FMA:
46037 case X86ISD::FMSUB:
46038 case X86ISD::FNMADD:
46039 case X86ISD::FNMSUB:
46040 case X86ISD::FMADD_RND:
46041 case X86ISD::FMSUB_RND:
46042 case X86ISD::FNMADD_RND:
46043 case X86ISD::FNMSUB_RND: {
46044 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
46045 !(SVT == MVT::f32 || SVT == MVT::f64) ||
46046 !isOperationLegal(ISD::FMA, VT))
46047 break;
46048
46049 // This is always negatible for free but we might be able to remove some
46050 // extra operand negations as well.
46051 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
46052 for (int i = 0; i != 3; ++i)
46053 NewOps[i] = getCheaperNegatedExpression(
46054 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
46055
46056 bool NegA = !!NewOps[0];
46057 bool NegB = !!NewOps[1];
46058 bool NegC = !!NewOps[2];
46059 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
46060
46061 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
46062 : NegatibleCost::Neutral;
46063
46064 // Fill in the non-negated ops with the original values.
46065 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
46066 if (!NewOps[i])
46067 NewOps[i] = Op.getOperand(i);
46068 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
46069 }
46070 case X86ISD::FRCP:
46071 if (SDValue NegOp0 =
46072 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
46073 ForCodeSize, Cost, Depth + 1))
46074 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
46075 break;
46076 }
46077
46078 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
46079 ForCodeSize, Cost, Depth);
46080}
46081
46082static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
46083 const X86Subtarget &Subtarget) {
46084 MVT VT = N->getSimpleValueType(0);
46085 // If we have integer vector types available, use the integer opcodes.
46086 if (!VT.isVector() || !Subtarget.hasSSE2())
46087 return SDValue();
46088
46089 SDLoc dl(N);
46090
46091 unsigned IntBits = VT.getScalarSizeInBits();
46092 MVT IntSVT = MVT::getIntegerVT(IntBits);
46093 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
46094
46095 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
46096 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
46097 unsigned IntOpcode;
46098 switch (N->getOpcode()) {
46099 default: llvm_unreachable("Unexpected FP logic op");
46100 case X86ISD::FOR: IntOpcode = ISD::OR; break;
46101 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
46102 case X86ISD::FAND: IntOpcode = ISD::AND; break;
46103 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
46104 }
46105 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
46106 return DAG.getBitcast(VT, IntOp);
46107}
46108
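Illustrative sketch (not part of X86ISelLowering.cpp; names invented): lowerX86FPLogicOp relies on the fact that a bitwise op on FP values can be done by reinterpreting the bits as integers, applying the integer op, and reinterpreting back.

// Standalone scalar model of performing an FP logic op via integer bit ops.
#include <cassert>
#include <cstdint>
#include <cstring>

static float FAndAsInt(float A, float B) {
  uint32_t IA, IB;
  std::memcpy(&IA, &A, sizeof(IA)); // "bitcast" to integer
  std::memcpy(&IB, &B, sizeof(IB));
  uint32_t IR = IA & IB;            // integer AND on the bit patterns
  float R;
  std::memcpy(&R, &IR, sizeof(R));  // "bitcast" back to float
  return R;
}

int main() {
  // ANDing with an all-zero bit pattern clears the value (FAND(x, 0.0) -> 0.0).
  assert(FAndAsInt(3.75f, 0.0f) == 0.0f);
  // ANDing with an all-ones bit pattern preserves the value's bits.
  uint32_t AllOnes = 0xFFFFFFFFu;
  float Mask;
  std::memcpy(&Mask, &AllOnes, sizeof(Mask));
  assert(FAndAsInt(3.75f, Mask) == 3.75f);
  return 0;
}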
46109
46110/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
46111static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
46112 if (N->getOpcode() != ISD::XOR)
46113 return SDValue();
46114
46115 SDValue LHS = N->getOperand(0);
46116 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
46117 return SDValue();
46118
46119 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
46120 X86::CondCode(LHS->getConstantOperandVal(0)));
46121 SDLoc DL(N);
46122 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
46123}
46124
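Illustrative sketch (not part of X86ISelLowering.cpp): the scalar identity behind foldXor1SetCC is that XOR-ing a 0/1 comparison result with 1 is the same as testing the opposite condition, so the XOR can be folded by inverting the condition code.

#include <cassert>

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      assert(((A < B) ^ 1) == (A >= B)); // setcc(LT) ^ 1 == setcc(GE)
  return 0;
}
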
46125static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
46126 TargetLowering::DAGCombinerInfo &DCI,
46127 const X86Subtarget &Subtarget) {
46128 // If this is SSE1 only convert to FXOR to avoid scalarization.
46129 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
46130 N->getValueType(0) == MVT::v4i32) {
46131 return DAG.getBitcast(
46132 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
46133 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
46134 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
46135 }
46136
46137 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
46138 return Cmp;
46139
46140 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
46141 return R;
46142
46143 if (DCI.isBeforeLegalizeOps())
46144 return SDValue();
46145
46146 if (SDValue SetCC = foldXor1SetCC(N, DAG))
46147 return SetCC;
46148
46149 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
46150 return RV;
46151
46152 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
46153 return FPLogic;
46154
46155 return combineFneg(N, DAG, DCI, Subtarget);
46156}
46157
46158static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
46159 TargetLowering::DAGCombinerInfo &DCI,
46160 const X86Subtarget &Subtarget) {
46161 EVT VT = N->getValueType(0);
46162 unsigned NumBits = VT.getSizeInBits();
46163
46164 // TODO - Constant Folding.
46165
46166 // Simplify the inputs.
46167 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46168 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
46169 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
46170 return SDValue(N, 0);
46171
46172 return SDValue();
46173}
46174
46175static bool isNullFPScalarOrVectorConst(SDValue V) {
46176 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
46177}
46178
46179/// If a value is a scalar FP zero or a vector FP zero (potentially including
46180/// undefined elements), return a zero constant that may be used to fold away
46181/// that value. In the case of a vector, the returned constant will not contain
46182/// undefined elements even if the input parameter does. This makes it suitable
46183/// to be used as a replacement operand with operations (eg, bitwise-and) where
46184/// an undef should not propagate.
46185static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
46186 const X86Subtarget &Subtarget) {
46187 if (!isNullFPScalarOrVectorConst(V))
46188 return SDValue();
46189
46190 if (V.getValueType().isVector())
46191 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
46192
46193 return V;
46194}
46195
46196static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
46197 const X86Subtarget &Subtarget) {
46198 SDValue N0 = N->getOperand(0);
46199 SDValue N1 = N->getOperand(1);
46200 EVT VT = N->getValueType(0);
46201 SDLoc DL(N);
46202
46203 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
46204 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
46205 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
46206 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
46207 return SDValue();
46208
46209 auto isAllOnesConstantFP = [](SDValue V) {
46210 if (V.getSimpleValueType().isVector())
46211 return ISD::isBuildVectorAllOnes(V.getNode());
46212 auto *C = dyn_cast<ConstantFPSDNode>(V);
46213 return C && C->getConstantFPValue()->isAllOnesValue();
46214 };
46215
46216 // fand (fxor X, -1), Y --> fandn X, Y
46217 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
46218 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
46219
46220 // fand X, (fxor Y, -1) --> fandn Y, X
46221 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
46222 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
46223
46224 return SDValue();
46225}
46226
46227/// Do target-specific dag combines on X86ISD::FAND nodes.
46228static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
46229 const X86Subtarget &Subtarget) {
46230 // FAND(0.0, x) -> 0.0
46231 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
46232 return V;
46233
46234 // FAND(x, 0.0) -> 0.0
46235 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
46236 return V;
46237
46238 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
46239 return V;
46240
46241 return lowerX86FPLogicOp(N, DAG, Subtarget);
46242}
46243
46244/// Do target-specific dag combines on X86ISD::FANDN nodes.
46245static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
46246 const X86Subtarget &Subtarget) {
46247 // FANDN(0.0, x) -> x
46248 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
46249 return N->getOperand(1);
46250
46251 // FANDN(x, 0.0) -> 0.0
46252 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
46253 return V;
46254
46255 return lowerX86FPLogicOp(N, DAG, Subtarget);
46256}
46257
46258/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
46259static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
46260 TargetLowering::DAGCombinerInfo &DCI,
46261 const X86Subtarget &Subtarget) {
46262 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
46263
46264 // F[X]OR(0.0, x) -> x
46265 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
46266 return N->getOperand(1);
46267
46268 // F[X]OR(x, 0.0) -> x
46269 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
46270 return N->getOperand(0);
46271
46272 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
46273 return NewVal;
46274
46275 return lowerX86FPLogicOp(N, DAG, Subtarget);
46276}
46277
46278/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
46279static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
46280 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
46281
46282 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
46283 if (!DAG.getTarget().Options.NoNaNsFPMath ||
46284 !DAG.getTarget().Options.NoSignedZerosFPMath)
46285 return SDValue();
46286
46287 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
46288 // into FMINC and FMAXC, which are commutative operations.
46289 unsigned NewOp = 0;
46290 switch (N->getOpcode()) {
46291 default: llvm_unreachable("unknown opcode");
46292 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
46293 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
46294 }
46295
46296 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
46297 N->getOperand(0), N->getOperand(1));
46298}
46299
46300static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
46301 const X86Subtarget &Subtarget) {
46302 if (Subtarget.useSoftFloat())
46303 return SDValue();
46304
46305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46306
46307 EVT VT = N->getValueType(0);
46308 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46309 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46310 (VT.isVector() && TLI.isTypeLegal(VT))))
46311 return SDValue();
46312
46313 SDValue Op0 = N->getOperand(0);
46314 SDValue Op1 = N->getOperand(1);
46315 SDLoc DL(N);
46316 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
46317
46318 // If we don't have to respect NaN inputs, this is a direct translation to x86
46319 // min/max instructions.
46320 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
46321 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
46322
46323 // If one of the operands is known non-NaN use the native min/max instructions
46324 // with the non-NaN input as second operand.
46325 if (DAG.isKnownNeverNaN(Op1))
46326 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
46327 if (DAG.isKnownNeverNaN(Op0))
46328 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
46329
46330 // If we have to respect NaN inputs, this takes at least 3 instructions.
46331 // Favor a library call when operating on a scalar and minimizing code size.
46332 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
46333 return SDValue();
46334
46335 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
46336 VT);
46337
46338 // There are 4 possibilities involving NaN inputs, and these are the required
46339 // outputs:
46340 //                  Op1
46341 //               Num     NaN
46342 //            ----------------
46343 //       Num  |  Max  |  Op0 |
46344 //  Op0       ----------------
46345 //       NaN  |  Op1  |  NaN |
46346 //            ----------------
46347 //
46348 // The SSE FP max/min instructions were not designed for this case, but rather
46349 // to implement:
46350 // Min = Op1 < Op0 ? Op1 : Op0
46351 // Max = Op1 > Op0 ? Op1 : Op0
46352 //
46353 // So they always return Op0 if either input is a NaN. However, we can still
46354 // use those instructions for fmaxnum by selecting away a NaN input.
46355
46356 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
46357 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
46358 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
46359
46360 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
46361 // are NaN, the NaN value of Op1 is the result.
46362 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
46363}
46364
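Illustrative sketch (not part of X86ISelLowering.cpp; names invented): a scalar model of the lowering above. An SSE-style max returns its second operand whenever the compare is false (including the NaN case), so the final select on "Op0 is NaN" recovers the behaviour described in the table.

#include <cassert>
#include <cmath>

// SSE semantics from the comment: Max = Op1 > Op0 ? Op1 : Op0, so Op0 is
// passed through whenever either input is a NaN.
static double X86Max(double Op1, double Op0) { return Op1 > Op0 ? Op1 : Op0; }

static double FMaxLowered(double Op0, double Op1) {
  double MinOrMax = X86Max(Op1, Op0);
  bool IsOp0Nan = std::isnan(Op0);  // SETUO on (Op0, Op0)
  return IsOp0Nan ? Op1 : MinOrMax; // select(IsOp0Nan, Op1, MinOrMax)
}

int main() {
  double NaN = std::nan("");
  assert(FMaxLowered(1.0, 2.0) == 2.0);
  assert(FMaxLowered(NaN, 2.0) == 2.0);      // Op0 NaN -> Op1
  assert(FMaxLowered(1.0, NaN) == 1.0);      // Op1 NaN -> Op0 passes through
  assert(std::isnan(FMaxLowered(NaN, NaN))); // both NaN -> NaN (from Op1)
  return 0;
}
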
46365static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
46366 TargetLowering::DAGCombinerInfo &DCI) {
46367 EVT VT = N->getValueType(0);
46368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46369
46370 APInt KnownUndef, KnownZero;
46371 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
46372 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
46373 KnownZero, DCI))
46374 return SDValue(N, 0);
46375
46376 // Convert a full vector load into vzload when not all bits are needed.
46377 SDValue In = N->getOperand(0);
46378 MVT InVT = In.getSimpleValueType();
46379 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
46380 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
46381 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
46382 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
46383 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
46384 MVT MemVT = MVT::getIntegerVT(NumBits);
46385 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
46386 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
46387 SDLoc dl(N);
46388 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
46389 DAG.getBitcast(InVT, VZLoad));
46390 DCI.CombineTo(N, Convert);
46391 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
46392 DCI.recursivelyDeleteUnusedNodes(LN);
46393 return SDValue(N, 0);
46394 }
46395 }
46396
46397 return SDValue();
46398}
46399
46400static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
46401 TargetLowering::DAGCombinerInfo &DCI) {
46402 bool IsStrict = N->isTargetStrictFPOpcode();
46403 EVT VT = N->getValueType(0);
46404
46405 // Convert a full vector load into vzload when not all bits are needed.
46406 SDValue In = N->getOperand(IsStrict ? 1 : 0);
46407 MVT InVT = In.getSimpleValueType();
46408 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
46409 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
46410 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
46411 LoadSDNode *LN = cast<LoadSDNode>(In);
46412 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
46413 MVT MemVT = MVT::getFloatingPointVT(NumBits);
46414 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
46415 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
46416 SDLoc dl(N);
46417 if (IsStrict) {
46418 SDValue Convert =
46419 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
46420 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
46421 DCI.CombineTo(N, Convert, Convert.getValue(1));
46422 } else {
46423 SDValue Convert =
46424 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
46425 DCI.CombineTo(N, Convert);
46426 }
46427 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
46428 DCI.recursivelyDeleteUnusedNodes(LN);
46429 return SDValue(N, 0);
46430 }
46431 }
46432
46433 return SDValue();
46434}
46435
46436/// Do target-specific dag combines on X86ISD::ANDNP nodes.
46437static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
46438 TargetLowering::DAGCombinerInfo &DCI,
46439 const X86Subtarget &Subtarget) {
46440 MVT VT = N->getSimpleValueType(0);
46441
46442 // ANDNP(0, x) -> x
46443 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
46444 return N->getOperand(1);
46445
46446 // ANDNP(x, 0) -> 0
46447 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
46448 return DAG.getConstant(0, SDLoc(N), VT);
46449
46450 // Turn ANDNP back to AND if input is inverted.
46451 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
46452 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
46453 N->getOperand(1));
46454
46455 // Attempt to recursively combine a bitmask ANDNP with shuffles.
46456 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
46457 SDValue Op(N, 0);
46458 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46459 return Res;
46460 }
46461
46462 return SDValue();
46463}
46464
46465static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
46466 TargetLowering::DAGCombinerInfo &DCI) {
46467 SDValue N1 = N->getOperand(1);
46468
46469 // BT ignores high bits in the bit index operand.
46470 unsigned BitWidth = N1.getValueSizeInBits();
46471 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
46472 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
46473 if (N->getOpcode() != ISD::DELETED_NODE)
46474 DCI.AddToWorklist(N);
46475 return SDValue(N, 0);
46476 }
46477
46478 return SDValue();
46479}
46480
46481static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
46482 TargetLowering::DAGCombinerInfo &DCI) {
46483 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
46484 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
46485
46486 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
46487 APInt KnownUndef, KnownZero;
46488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46489 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
46490 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
46491 DCI)) {
46492 if (N->getOpcode() != ISD::DELETED_NODE)
46493 DCI.AddToWorklist(N);
46494 return SDValue(N, 0);
46495 }
46496
46497 // Convert a full vector load into vzload when not all bits are needed.
46498 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
46499 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
46500 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
46501 SDLoc dl(N);
46502 if (IsStrict) {
46503 SDValue Convert = DAG.getNode(
46504 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
46505 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
46506 DCI.CombineTo(N, Convert, Convert.getValue(1));
46507 } else {
46508 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
46509 DAG.getBitcast(MVT::v8i16, VZLoad));
46510 DCI.CombineTo(N, Convert);
46511 }
46512
46513 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
46514 DCI.recursivelyDeleteUnusedNodes(LN);
46515 return SDValue(N, 0);
46516 }
46517 }
46518 }
46519
46520 return SDValue();
46521}
46522
46523// Try to combine sext_in_reg of a cmov of constants by extending the constants.
46524static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
46525 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
46526
46527 EVT DstVT = N->getValueType(0);
46528
46529 SDValue N0 = N->getOperand(0);
46530 SDValue N1 = N->getOperand(1);
46531 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
46532
46533 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
46534 return SDValue();
46535
46536 // Look through single use any_extends / truncs.
46537 SDValue IntermediateBitwidthOp;
46538 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
46539 N0.hasOneUse()) {
46540 IntermediateBitwidthOp = N0;
46541 N0 = N0.getOperand(0);
46542 }
46543
46544 // See if we have a single use cmov.
46545 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
46546 return SDValue();
46547
46548 SDValue CMovOp0 = N0.getOperand(0);
46549 SDValue CMovOp1 = N0.getOperand(1);
46550
46551 // Make sure both operands are constants.
46552 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
46553 !isa<ConstantSDNode>(CMovOp1.getNode()))
46554 return SDValue();
46555
46556 SDLoc DL(N);
46557
46558 // If we looked through an any_extend/trunc above, apply that same operation to the constants.
46559 if (IntermediateBitwidthOp) {
46560 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
46561 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
46562 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
46563 }
46564
46565 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
46566 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
46567
46568 EVT CMovVT = DstVT;
46569 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
46570 if (DstVT == MVT::i16) {
46571 CMovVT = MVT::i32;
46572 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
46573 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
46574 }
46575
46576 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
46577 N0.getOperand(2), N0.getOperand(3));
46578
46579 if (CMovVT != DstVT)
46580 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
46581
46582 return CMov;
46583}
46584
46585static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
46586 const X86Subtarget &Subtarget) {
46587 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
46588
46589 if (SDValue V = combineSextInRegCmov(N, DAG))
46590 return V;
46591
46592 EVT VT = N->getValueType(0);
46593 SDValue N0 = N->getOperand(0);
46594 SDValue N1 = N->getOperand(1);
46595 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
46596 SDLoc dl(N);
46597
46598 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
46599 // SSE and AVX2 since there is no sign-extended shift right
46600 // operation on a vector with 64-bit elements.
46601 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
46602 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
46603 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
46604 N0.getOpcode() == ISD::SIGN_EXTEND)) {
46605 SDValue N00 = N0.getOperand(0);
46606
46607 // EXTLOAD has a better solution on AVX2,
46608 // it may be replaced with X86ISD::VSEXT node.
46609 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
46610 if (!ISD::isNormalLoad(N00.getNode()))
46611 return SDValue();
46612
46613 // Attempt to promote any comparison mask ops before moving the
46614 // SIGN_EXTEND_INREG in the way.
46615 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
46616 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
46617
46618 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
46619 SDValue Tmp =
46620 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
46621 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
46622 }
46623 }
46624 return SDValue();
46625}
46626
46627/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
46628/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
46629/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
46630/// opportunities to combine math ops, use an LEA, or use a complex addressing
46631/// mode. This can eliminate extend, add, and shift instructions.
46632static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
46633 const X86Subtarget &Subtarget) {
46634 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
46635 Ext->getOpcode() != ISD::ZERO_EXTEND)
46636 return SDValue();
46637
46638 // TODO: This should be valid for other integer types.
46639 EVT VT = Ext->getValueType(0);
46640 if (VT != MVT::i64)
46641 return SDValue();
46642
46643 SDValue Add = Ext->getOperand(0);
46644 if (Add.getOpcode() != ISD::ADD)
46645 return SDValue();
46646
46647 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
46648 bool NSW = Add->getFlags().hasNoSignedWrap();
46649 bool NUW = Add->getFlags().hasNoUnsignedWrap();
46650
46651 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
46652 // into the 'zext'.
46653 if ((Sext && !NSW) || (!Sext && !NUW))
46654 return SDValue();
46655
46656 // Having a constant operand to the 'add' ensures that we are not increasing
46657 // the instruction count because the constant is extended for free below.
46658 // A constant operand can also become the displacement field of an LEA.
46659 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
46660 if (!AddOp1)
46661 return SDValue();
46662
46663 // Don't make the 'add' bigger if there's no hope of combining it with some
46664 // other 'add' or 'shl' instruction.
46665 // TODO: It may be profitable to generate simpler LEA instructions in place
46666 // of single 'add' instructions, but the cost model for selecting an LEA
46667 // currently has a high threshold.
46668 bool HasLEAPotential = false;
46669 for (auto *User : Ext->uses()) {
46670 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
46671 HasLEAPotential = true;
46672 break;
46673 }
46674 }
46675 if (!HasLEAPotential)
46676 return SDValue();
46677
46678 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
46679 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
46680 SDValue AddOp0 = Add.getOperand(0);
46681 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
46682 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
46683
46684 // The wider add is guaranteed to not wrap because both operands are
46685 // sign-extended.
46686 SDNodeFlags Flags;
46687 Flags.setNoSignedWrap(NSW);
46688 Flags.setNoUnsignedWrap(NUW);
46689 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
46690}
46691
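Illustrative sketch (not part of X86ISelLowering.cpp): the identity behind promoteExtBeforeAdd. When the narrow add is known not to wrap (nsw for sext, nuw for zext), extending first and adding the extended constant gives the same value, which is what lets the wide add feed an LEA.

#include <cassert>
#include <cstdint>

int main() {
  int32_t X = 100; // assume the i32 add below is known not to wrap (nsw)
  int32_t C = 24;
  int64_t Narrow = (int64_t)(X + C);      // sext(add_nsw(x, C))
  int64_t Wide = (int64_t)X + (int64_t)C; // add(sext(x), C_sext)
  assert(Narrow == Wide);
  return 0;
}
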
46692// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
46693// operands and the result of CMOV is not used anywhere else - promote CMOV
46694// itself instead of promoting its result. This could be beneficial, because:
46695// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
46696// (or more) pseudo-CMOVs only when they go one-after-another and
46697// getting rid of result extension code after CMOV will help that.
46698// 2) Promotion of constant CMOV arguments is free, hence the
46699// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
46700 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
46701 // promotion is also good in terms of code-size.
46702 // (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
46703 // promotion).
46704static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
46705 SDValue CMovN = Extend->getOperand(0);
46706 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
46707 return SDValue();
46708
46709 EVT TargetVT = Extend->getValueType(0);
46710 unsigned ExtendOpcode = Extend->getOpcode();
46711 SDLoc DL(Extend);
46712
46713 EVT VT = CMovN.getValueType();
46714 SDValue CMovOp0 = CMovN.getOperand(0);
46715 SDValue CMovOp1 = CMovN.getOperand(1);
46716
46717 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
46718 !isa<ConstantSDNode>(CMovOp1.getNode()))
46719 return SDValue();
46720
46721 // Only extend to i32 or i64.
46722 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
46723 return SDValue();
46724
46725 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
46726 // are free.
46727 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
46728 return SDValue();
46729
46730 // If this is a zero extend to i64, we should only extend to i32 and use a free
46731 // zero extend to finish.
46732 EVT ExtendVT = TargetVT;
46733 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
46734 ExtendVT = MVT::i32;
46735
46736 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
46737 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
46738
46739 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
46740 CMovN.getOperand(2), CMovN.getOperand(3));
46741
46742 // Finish extending if needed.
46743 if (ExtendVT != TargetVT)
46744 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
46745
46746 return Res;
46747}
46748
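Illustrative sketch (not part of X86ISelLowering.cpp): promoting the CMOV itself is safe here because extending after the select of two constants equals selecting between the pre-extended constants.

#include <cassert>
#include <cstdint>

int main() {
  int16_t C0 = -5, C1 = 7;
  for (bool Cond : {false, true}) {
    int32_t ExtOfCmov = (int32_t)(Cond ? C0 : C1);        // sext(cmov(C0, C1))
    int32_t CmovOfExt = Cond ? (int32_t)C0 : (int32_t)C1; // cmov(sext(C0), sext(C1))
    assert(ExtOfCmov == CmovOfExt);
  }
  return 0;
}
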
46749// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46750// This is more or less the reverse of combineBitcastvxi1.
46751static SDValue
46752combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
46753 TargetLowering::DAGCombinerInfo &DCI,
46754 const X86Subtarget &Subtarget) {
46755 unsigned Opcode = N->getOpcode();
46756 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46757 Opcode != ISD::ANY_EXTEND)
46758 return SDValue();
46759 if (!DCI.isBeforeLegalizeOps())
46760 return SDValue();
46761 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46762 return SDValue();
46763
46764 SDValue N0 = N->getOperand(0);
46765 EVT VT = N->getValueType(0);
46766 EVT SVT = VT.getScalarType();
46767 EVT InSVT = N0.getValueType().getScalarType();
46768 unsigned EltSizeInBits = SVT.getSizeInBits();
46769
46770 // Input type must be extending a bool vector (bit-casted from a scalar
46771 // integer) to legal integer types.
46772 if (!VT.isVector())
46773 return SDValue();
46774 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46775 return SDValue();
46776 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46777 return SDValue();
46778
46779 SDValue N00 = N0.getOperand(0);
46780 EVT SclVT = N0.getOperand(0).getValueType();
46781 if (!SclVT.isScalarInteger())
46782 return SDValue();
46783
46784 SDLoc DL(N);
46785 SDValue Vec;
46786 SmallVector<int, 32> ShuffleMask;
46787 unsigned NumElts = VT.getVectorNumElements();
46788 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46789
46790 // Broadcast the scalar integer to the vector elements.
46791 if (NumElts > EltSizeInBits) {
46792 // If the scalar integer is greater than the vector element size, then we
46793 // must split it down into sub-sections for broadcasting. For example:
46794 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46795 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46796 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46797 unsigned Scale = NumElts / EltSizeInBits;
46798 EVT BroadcastVT =
46799 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46800 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46801 Vec = DAG.getBitcast(VT, Vec);
46802
46803 for (unsigned i = 0; i != Scale; ++i)
46804 ShuffleMask.append(EltSizeInBits, i);
46805 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46806 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46807 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46808 // If we have register broadcast instructions, use the scalar size as the
46809 // element type for the shuffle. Then cast to the wider element type. The
46810 // widened bits won't be used, and this might allow the use of a broadcast
46811 // load.
46812 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46813 unsigned Scale = EltSizeInBits / NumElts;
46814 EVT BroadcastVT =
46815 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46816 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46817 ShuffleMask.append(NumElts * Scale, 0);
46818 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46819 Vec = DAG.getBitcast(VT, Vec);
46820 } else {
46821 // For a smaller scalar integer, we can simply any-extend it to the vector
46822 // element size (we don't care about the upper bits) and broadcast it to all
46823 // elements.
46824 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46825 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46826 ShuffleMask.append(NumElts, 0);
46827 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46828 }
46829
46830 // Now, mask the relevant bit in each element.
46831 SmallVector<SDValue, 32> Bits;
46832 for (unsigned i = 0; i != NumElts; ++i) {
46833 int BitIdx = (i % EltSizeInBits);
46834 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46835 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46836 }
46837 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46838 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46839
46840 // Compare against the bitmask and extend the result.
46841 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46842 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46843 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46844
46845 // For SEXT, this is now done, otherwise shift the result down for
46846 // zero-extension.
46847 if (Opcode == ISD::SIGN_EXTEND)
46848 return Vec;
46849 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46850 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46851}
46852
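Illustrative sketch (not part of X86ISelLowering.cpp; names invented): a scalar model of the broadcast + per-lane bit mask + compare sequence built above, for the simple case where the number of vector elements equals the scalar bit width (e.g. i8 bitcast to v8i1, sign-extended to v8i8).

#include <array>
#include <cassert>
#include <cstdint>

static std::array<int8_t, 8> SextBoolVector(uint8_t Scl) {
  std::array<int8_t, 8> Out{};
  for (unsigned I = 0; I != 8; ++I) {
    uint8_t Bit = uint8_t(1u << I);            // per-lane bit mask
    uint8_t Masked = uint8_t(Scl & Bit);       // AND the broadcast scalar with it
    Out[I] = (Masked == Bit) ? int8_t(-1) : 0; // SETEQ, then sign-extend
  }
  return Out;
}

int main() {
  auto V = SextBoolVector(0b00000101);
  assert(V[0] == -1 && V[1] == 0 && V[2] == -1 && V[7] == 0);
  return 0;
}
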
46853// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
46854// result type.
46855static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
46856 const X86Subtarget &Subtarget) {
46857 SDValue N0 = N->getOperand(0);
46858 EVT VT = N->getValueType(0);
46859 SDLoc dl(N);
46860
46861 // Only do this combine with AVX512 for vector extends.
46862 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
46863 return SDValue();
46864
46865 // Only combine legal element types.
46866 EVT SVT = VT.getVectorElementType();
46867 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
46868 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
46869 return SDValue();
46870
46871 // We can only do this if the vector size is 256 bits or less.
46872 unsigned Size = VT.getSizeInBits();
46873 if (Size > 256 && Subtarget.useAVX512Regs())
46874 return SDValue();
46875
46876 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
46877 // those are the only integer compares we have.
46878 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
46879 if (ISD::isUnsignedIntSetCC(CC))
46880 return SDValue();
46881
46882 // Only do this combine if the extension will be fully consumed by the setcc.
46883 EVT N00VT = N0.getOperand(0).getValueType();
46884 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
46885 if (Size != MatchingVecType.getSizeInBits())
46886 return SDValue();
46887
46888 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
46889
46890 if (N->getOpcode() == ISD::ZERO_EXTEND)
46891 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
46892
46893 return Res;
46894}
46895
46896static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
46897 TargetLowering::DAGCombinerInfo &DCI,
46898 const X86Subtarget &Subtarget) {
46899 SDValue N0 = N->getOperand(0);
46900 EVT VT = N->getValueType(0);
46901 SDLoc DL(N);
46902
46903 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
46904 if (!DCI.isBeforeLegalizeOps() &&
46905 N0.getOpcode() == X86ISD::SETCC_CARRY) {
46906 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
46907 N0->getOperand(1));
46908 bool ReplaceOtherUses = !N0.hasOneUse();
46909 DCI.CombineTo(N, Setcc);
46910 // Replace other uses with a truncate of the widened setcc_carry.
46911 if (ReplaceOtherUses) {
46912 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
46913 N0.getValueType(), Setcc);
46914 DCI.CombineTo(N0.getNode(), Trunc);
46915 }
46916
46917 return SDValue(N, 0);
46918 }
46919
46920 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
46921 return NewCMov;
46922
46923 if (!DCI.isBeforeLegalizeOps())
46924 return SDValue();
46925
46926 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
46927 return V;
46928
46929 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
46930 return V;
46931
46932 if (VT.isVector()) {
46933 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
46934 return R;
46935
46936 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
46937 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
46938 }
46939
46940 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
46941 return NewAdd;
46942
46943 return SDValue();
46944}
46945
46946static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
46947 TargetLowering::DAGCombinerInfo &DCI,
46948 const X86Subtarget &Subtarget) {
46949 SDLoc dl(N);
46950 EVT VT = N->getValueType(0);
46951 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
46952
46953 // Let legalize expand this if it isn't a legal type yet.
46954 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46955 if (!TLI.isTypeLegal(VT))
46956 return SDValue();
46957
46958 SDValue A = N->getOperand(IsStrict ? 1 : 0);
46959 SDValue B = N->getOperand(IsStrict ? 2 : 1);
46960 SDValue C = N->getOperand(IsStrict ? 3 : 2);
46961
46962 // If the operation allows fast-math and the target does not support FMA,
46963 // split this into mul+add to avoid libcall(s).
46964 SDNodeFlags Flags = N->getFlags();
46965 if (!IsStrict && Flags.hasAllowReassociation() &&
46966 TLI.isOperationExpand(ISD::FMA, VT)) {
46967 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
46968 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
46969 }
46970
46971 EVT ScalarVT = VT.getScalarType();
46972 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
46973 return SDValue();
46974
46975 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
46976 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
46977 bool LegalOperations = !DCI.isBeforeLegalizeOps();
46978 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
46979 CodeSize)) {
46980 V = NegV;
46981 return true;
46982 }
46983 // Look through extract_vector_elts. If it comes from an FNEG, create a
46984 // new extract from the FNEG input.
46985 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46986 isNullConstant(V.getOperand(1))) {
46987 SDValue Vec = V.getOperand(0);
46988 if (SDValue NegV = TLI.getCheaperNegatedExpression(
46989 Vec, DAG, LegalOperations, CodeSize)) {
46990 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
46991 NegV, V.getOperand(1));
46992 return true;
46993 }
46994 }
46995
46996 return false;
46997 };
46998
46999 // Do not convert the passthru input of scalar intrinsics.
47000 // FIXME: We could allow negations of the lower element only.
47001 bool NegA = invertIfNegative(A);
47002 bool NegB = invertIfNegative(B);
47003 bool NegC = invertIfNegative(C);
47004
47005 if (!NegA && !NegB && !NegC)
47006 return SDValue();
47007
47008 unsigned NewOpcode =
47009 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
47010
47011 if (IsStrict) {
47012 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
47013 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
47014 {N->getOperand(0), A, B, C});
47015 } else {
47016 if (N->getNumOperands() == 4)
47017 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
47018 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
47019 }
47020}
47021
47022// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
47023// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
47024static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
47025 TargetLowering::DAGCombinerInfo &DCI) {
47026 SDLoc dl(N);
47027 EVT VT = N->getValueType(0);
47028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47029 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47030 bool LegalOperations = !DCI.isBeforeLegalizeOps();
47031
47032 SDValue N2 = N->getOperand(2);
47033
47034 SDValue NegN2 =
47035 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
47036 if (!NegN2)
47037 return SDValue();
47038 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
47039
47040 if (N->getNumOperands() == 4)
47041 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
47042 NegN2, N->getOperand(3));
47043 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
47044 NegN2);
47045}
47046
47047static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
47048 TargetLowering::DAGCombinerInfo &DCI,
47049 const X86Subtarget &Subtarget) {
47050 SDLoc dl(N);
47051 SDValue N0 = N->getOperand(0);
47052 EVT VT = N->getValueType(0);
47053
47054 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
47055 // FIXME: Is this needed? We don't seem to have any tests for it.
47056 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
47057 N0.getOpcode() == X86ISD::SETCC_CARRY) {
47058 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
47059 N0->getOperand(1));
47060 bool ReplaceOtherUses = !N0.hasOneUse();
47061 DCI.CombineTo(N, Setcc);
47062 // Replace other uses with a truncate of the widened setcc_carry.
47063 if (ReplaceOtherUses) {
47064 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
47065 N0.getValueType(), Setcc);
47066 DCI.CombineTo(N0.getNode(), Trunc);
47067 }
47068
47069 return SDValue(N, 0);
47070 }
47071
47072 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
47073 return NewCMov;
47074
47075 if (DCI.isBeforeLegalizeOps())
47076 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
47077 return V;
47078
47079 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
47080 return V;
47081
47082 if (VT.isVector())
47083 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
47084 return R;
47085
47086 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
47087 return NewAdd;
47088
47089 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
47090 return R;
47091
47092 // TODO: Combine with any target/faux shuffle.
47093 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
47094 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
47095 SDValue N00 = N0.getOperand(0);
47096 SDValue N01 = N0.getOperand(1);
47097 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
47098 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
47099 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
47100 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
47101 return concatSubVectors(N00, N01, DAG, dl);
47102 }
47103 }
47104
47105 return SDValue();
47106}
47107
47108/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
47109/// recognizable memcmp expansion.
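/// The expected shape is a tree of ORs whose leaves are all XORs, e.g.
/// (or (xor A, B), (xor C, D)), as produced for an oversized integer compare.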
47110static bool isOrXorXorTree(SDValue X, bool Root = true) {
47111 if (X.getOpcode() == ISD::OR)
47112 return isOrXorXorTree(X.getOperand(0), false) &&
47113 isOrXorXorTree(X.getOperand(1), false);
47114 if (Root)
47115 return false;
47116 return X.getOpcode() == ISD::XOR;
47117}
47118
47119/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
47120/// expansion.
47121template<typename F>
47122static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
47123 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
47124 SDValue Op0 = X.getOperand(0);
47125 SDValue Op1 = X.getOperand(1);
47126 if (X.getOpcode() == ISD::OR) {
47127 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
47128 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
47129 if (VecVT != CmpVT)
47130 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
47131 if (HasPT)
47132 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
47133 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
47134 } else if (X.getOpcode() == ISD::XOR) {
47135 SDValue A = SToV(Op0);
47136 SDValue B = SToV(Op1);
47137 if (VecVT != CmpVT)
47138 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
47139 if (HasPT)
47140 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
47141 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
47142 }
47143 llvm_unreachable("Impossible");
47144}
47145
47146/// Try to map a 128-bit or larger integer comparison to vector instructions
47147/// before type legalization splits it up into chunks.
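/// For example, an i256 equality compare can become a v32i8 compare whose
/// result is checked with PTEST, KORTEST or MOVMSK depending on the subtarget
/// (see the feature checks below).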
47148static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
47149 const X86Subtarget &Subtarget) {
47150 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
47151 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
47152
47153 // We're looking for an oversized integer equality comparison.
47154 SDValue X = SetCC->getOperand(0);
47155 SDValue Y = SetCC->getOperand(1);
47156 EVT OpVT = X.getValueType();
47157 unsigned OpSize = OpVT.getSizeInBits();
47158 if (!OpVT.isScalarInteger() || OpSize < 128)
47159 return SDValue();
47160
47161 // Ignore a comparison with zero because that gets special treatment in
47162 // EmitTest(). But make an exception for the special case of a pair of
47163 // logically-combined vector-sized operands compared to zero. This pattern may
47164 // be generated by the memcmp expansion pass with oversized integer compares
47165 // (see PR33325).
47166 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
47167 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
47168 return SDValue();
47169
47170 // Don't perform this combine if constructing the vector will be expensive.
47171 auto IsVectorBitCastCheap = [](SDValue X) {
47172 X = peekThroughBitcasts(X);
47173 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
47174 X.getOpcode() == ISD::LOAD;
47175 };
47176 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
47177 !IsOrXorXorTreeCCZero)
47178 return SDValue();
47179
47180 EVT VT = SetCC->getValueType(0);
47181 SDLoc DL(SetCC);
47182
47183 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
47184 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
47185 // Otherwise use PCMPEQ (plus AND) and mask testing.
47186 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
47187 (OpSize == 256 && Subtarget.hasAVX()) ||
47188 (OpSize == 512 && Subtarget.useAVX512Regs())) {
47189 bool HasPT = Subtarget.hasSSE41();
47190
47191 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
47192 // vector registers are essentially free. (Technically, widening registers
47193 // prevents load folding, but the tradeoff is worth it.)
47194 bool PreferKOT = Subtarget.preferMaskRegisters();
47195 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
47196
47197 EVT VecVT = MVT::v16i8;
47198 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
47199 if (OpSize == 256) {
47200 VecVT = MVT::v32i8;
47201 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
47202 }
47203 EVT CastVT = VecVT;
47204 bool NeedsAVX512FCast = false;
47205 if (OpSize == 512 || NeedZExt) {
47206 if (Subtarget.hasBWI()) {
47207 VecVT = MVT::v64i8;
47208 CmpVT = MVT::v64i1;
47209 if (OpSize == 512)
47210 CastVT = VecVT;
47211 } else {
47212 VecVT = MVT::v16i32;
47213 CmpVT = MVT::v16i1;
47214 CastVT = OpSize == 512 ? VecVT :
47215 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
47216 NeedsAVX512FCast = true;
47217 }
47218 }
47219
47220 auto ScalarToVector = [&](SDValue X) -> SDValue {
47221 bool TmpZext = false;
47222 EVT TmpCastVT = CastVT;
47223 if (X.getOpcode() == ISD::ZERO_EXTEND) {
47224 SDValue OrigX = X.getOperand(0);
47225 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
47226 if (OrigSize < OpSize) {
47227 if (OrigSize == 128) {
47228 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
47229 X = OrigX;
47230 TmpZext = true;
47231 } else if (OrigSize == 256) {
47232 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
47233 X = OrigX;
47234 TmpZext = true;
47235 }
47236 }
47237 }
47238 X = DAG.getBitcast(TmpCastVT, X);
47239 if (!NeedZExt && !TmpZext)
47240 return X;
47241 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
47242 DAG.getConstant(0, DL, VecVT), X,
47243 DAG.getVectorIdxConstant(0, DL));
47244 };
47245
47246 SDValue Cmp;
47247 if (IsOrXorXorTreeCCZero) {
47248 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
47249 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
47250 // Use 2 vector equality compares and 'and' the results before doing a
47251 // MOVMSK.
47252 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
47253 } else {
47254 SDValue VecX = ScalarToVector(X);
47255 SDValue VecY = ScalarToVector(Y);
47256 if (VecVT != CmpVT) {
47257 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
47258 } else if (HasPT) {
47259 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
47260 } else {
47261 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
47262 }
47263 }
47264 // AVX512 should emit a setcc that will lower to kortest.
47265 if (VecVT != CmpVT) {
47266 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
47267 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
47268 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
47269 DAG.getConstant(0, DL, KRegVT), CC);
47270 }
47271 if (HasPT) {
47272 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
47273 Cmp);
47274 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
47275 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
47276 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
47277 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
47278 }
47279 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
47280 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
47281 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
47282 assert(Cmp.getValueType() == MVT::v16i8 &&
47283        "Non 128-bit vector on pre-SSE41 target");
47284 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
47285 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
47286 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
47287 }
47288
47289 return SDValue();
47290}
47291
47292static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
47293 const X86Subtarget &Subtarget) {
47294 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
47295 const SDValue LHS = N->getOperand(0);
47296 const SDValue RHS = N->getOperand(1);
47297 EVT VT = N->getValueType(0);
47298 EVT OpVT = LHS.getValueType();
47299 SDLoc DL(N);
47300
47301 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
47302 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
47303 return V;
47304
47305 if (VT == MVT::i1 && isNullConstant(RHS)) {
47306 SDValue X86CC;
47307 if (SDValue V =
47308 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
47309 return DAG.getNode(ISD::TRUNCATE, DL, VT,
47310 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
47311 }
47312 }
47313
47314 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
47315 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
47316 // Using temporaries to avoid messing up operand ordering for later
47317 // transformations if this doesn't work.
47318 SDValue Op0 = LHS;
47319 SDValue Op1 = RHS;
47320 ISD::CondCode TmpCC = CC;
47321 // Put build_vector on the right.
47322 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
47323 std::swap(Op0, Op1);
47324 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
47325 }
47326
47327 bool IsSEXT0 =
47328 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
47329 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
47330 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
47331
47332 if (IsSEXT0 && IsVZero1) {
47333 assert(VT == Op0.getOperand(0).getValueType() &&
47334        "Unexpected operand type");
47335 if (TmpCC == ISD::SETGT)
47336 return DAG.getConstant(0, DL, VT);
47337 if (TmpCC == ISD::SETLE)
47338 return DAG.getConstant(1, DL, VT);
47339 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
47340 return DAG.getNOT(DL, Op0.getOperand(0), VT);
47341
47342 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
47343        "Unexpected condition code!");
47344 return Op0.getOperand(0);
47345 }
47346 }
47347
47348 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
47349 // pre-promote its result type since vXi1 vectors don't get promoted
47350 // during type legalization.
47351 // NOTE: The element count check is to ignore operand types that need to
47352 // go through type promotion to a 128-bit vector.
47353 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
47354 VT.getVectorElementType() == MVT::i1 &&
47355 (OpVT.getVectorElementType() == MVT::i8 ||
47356 OpVT.getVectorElementType() == MVT::i16)) {
47357 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
47358 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
47359 }
47360
47361 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
47362 // to avoid scalarization via legalization because v4i32 is not a legal type.
47363 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
47364 LHS.getValueType() == MVT::v4f32)
47365 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
47366
47367 return SDValue();
47368}
47369
47370static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
47371 TargetLowering::DAGCombinerInfo &DCI,
47372 const X86Subtarget &Subtarget) {
47373 SDValue Src = N->getOperand(0);
47374 MVT SrcVT = Src.getSimpleValueType();
47375 MVT VT = N->getSimpleValueType(0);
47376 unsigned NumBits = VT.getScalarSizeInBits();
47377 unsigned NumElts = SrcVT.getVectorNumElements();
47378
47379 // Perform constant folding.
47380 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
47381 assert(VT == MVT::i32 && "Unexpected result type");
47382 APInt Imm(32, 0);
47383 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
47384 if (!Src.getOperand(Idx).isUndef() &&
47385 Src.getConstantOperandAPInt(Idx).isNegative())
47386 Imm.setBit(Idx);
47387 }
47388 return DAG.getConstant(Imm, SDLoc(N), VT);
47389 }
47390
47391 // Look through int->fp bitcasts that don't change the element width.
47392 unsigned EltWidth = SrcVT.getScalarSizeInBits();
47393 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
47394 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
47395 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
47396
47397 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
47398 // with scalar comparisons.
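// Only the low NumElts bits of the MOVMSK result can be set, so the 'not' is
// emitted as an XOR with a mask of that many low bits.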
47399 if (SDValue NotSrc = IsNOT(Src, DAG)) {
47400 SDLoc DL(N);
47401 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
47402 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
47403 return DAG.getNode(ISD::XOR, DL, VT,
47404 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
47405 DAG.getConstant(NotMask, DL, VT));
47406 }
47407
47408 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
47409 // results with scalar comparisons.
47410 if (Src.getOpcode() == X86ISD::PCMPGT &&
47411 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
47412 SDLoc DL(N);
47413 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
47414 return DAG.getNode(ISD::XOR, DL, VT,
47415 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
47416 DAG.getConstant(NotMask, DL, VT));
47417 }
47418
47419 // Simplify the inputs.
47420 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47421 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47422 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47423 return SDValue(N, 0);
47424
47425 return SDValue();
47426}
47427
47428static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
47429 TargetLowering::DAGCombinerInfo &DCI) {
47430 // With vector masks we only demand the upper bit of the mask.
47431 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
47432 if (Mask.getScalarValueSizeInBits() != 1) {
47433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47434 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
47435 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
47436 if (N->getOpcode() != ISD::DELETED_NODE)
47437 DCI.AddToWorklist(N);
47438 return SDValue(N, 0);
47439 }
47440 }
47441
47442 return SDValue();
47443}
47444
47445static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
47446 SDValue Index, SDValue Base, SDValue Scale,
47447 SelectionDAG &DAG) {
47448 SDLoc DL(GorS);
47449
47450 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
47451 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
47452 Gather->getMask(), Base, Index, Scale } ;
47453 return DAG.getMaskedGather(Gather->getVTList(),
47454 Gather->getMemoryVT(), DL, Ops,
47455 Gather->getMemOperand(),
47456 Gather->getIndexType(),
47457 Gather->getExtensionType());
47458 }
47459 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
47460 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
47461 Scatter->getMask(), Base, Index, Scale };
47462 return DAG.getMaskedScatter(Scatter->getVTList(),
47463 Scatter->getMemoryVT(), DL,
47464 Ops, Scatter->getMemOperand(),
47465 Scatter->getIndexType(),
47466 Scatter->isTruncatingStore());
47467}
47468
47469static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
47470 TargetLowering::DAGCombinerInfo &DCI) {
47471 SDLoc DL(N);
47472 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
47473 SDValue Index = GorS->getIndex();
47474 SDValue Base = GorS->getBasePtr();
47475 SDValue Scale = GorS->getScale();
47476
47477 if (DCI.isBeforeLegalize()) {
47478 unsigned IndexWidth = Index.getScalarValueSizeInBits();
47479
47480 // Shrink constant indices if they are larger than 32-bits.
47481 // Only do this before legalize types since v2i64 could become v2i32.
47482 // FIXME: We could check that the type is legal if we're after legalize
47483 // types, but then we would need to construct test cases where that happens.
47484 // FIXME: We could support more than just constant vectors, but we need to
47485 // be careful with costing. A truncate that can be optimized out would be fine.
47486 // Otherwise we might only want to create a truncate if it avoids a split.
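// e.g. a v2i64 index built from constants that fit in signed 32 bits can be
// truncated to v2i32 here and the gather/scatter rebuilt around it.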
47487 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
47488 if (BV->isConstant() && IndexWidth > 32 &&
47489 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
47490 unsigned NumElts = Index.getValueType().getVectorNumElements();
47491 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
47492 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
47493 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
47494 }
47495 }
47496
47497 // Shrink any sign/zero extends from a type of 32 bits or smaller to a type
47498 // larger than 32 bits, if there are sufficient sign bits. Only do this
47499 // before legalize types to avoid creating illegal types in truncate.
47500 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
47501 Index.getOpcode() == ISD::ZERO_EXTEND) &&
47502 IndexWidth > 32 &&
47503 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
47504 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
47505 unsigned NumElts = Index.getValueType().getVectorNumElements();
47506 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
47507 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
47508 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
47509 }
47510 }
47511
47512 if (DCI.isBeforeLegalizeOps()) {
47513 unsigned IndexWidth = Index.getScalarValueSizeInBits();
47514
47515 // Make sure the index is either i32 or i64
47516 if (IndexWidth != 32 && IndexWidth != 64) {
47517 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
47518 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
47519 Index.getValueType().getVectorNumElements());
47520 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
47521 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
47522 }
47523 }
47524
47525 // With vector masks we only demand the upper bit of the mask.
47526 SDValue Mask = GorS->getMask();
47527 if (Mask.getScalarValueSizeInBits() != 1) {
47528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47529 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
47530 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
47531 if (N->getOpcode() != ISD::DELETED_NODE)
47532 DCI.AddToWorklist(N);
47533 return SDValue(N, 0);
47534 }
47535 }
47536
47537 return SDValue();
47538}
47539
47540// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
47541static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
47542 const X86Subtarget &Subtarget) {
47543 SDLoc DL(N);
47544 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
47545 SDValue EFLAGS = N->getOperand(1);
47546
47547 // Try to simplify the EFLAGS and condition code operands.
47548 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
47549 return getSETCC(CC, Flags, DL, DAG);
47550
47551 return SDValue();
47552}
47553
47554/// Optimize branch condition evaluation.
47555static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
47556 const X86Subtarget &Subtarget) {
47557 SDLoc DL(N);
47558 SDValue EFLAGS = N->getOperand(3);
47559 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
47560
47561 // Try to simplify the EFLAGS and condition code operands.
47562 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
47563 // RAUW them under us.
47564 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
47565 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
47566 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
47567 N->getOperand(1), Cond, Flags);
47568 }
47569
47570 return SDValue();
47571}
47572
47573// TODO: Could we move this to DAGCombine?
47574static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
47575 SelectionDAG &DAG) {
47576 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
47577 // to optimize away operation when it's from a constant.
47578 //
47579 // The general transformation is:
47580 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
47581 // AND(VECTOR_CMP(x,y), constant2)
47582 // constant2 = UNARYOP(constant)
47583
47584 // Early exit if this isn't a vector operation, the operand of the
47585 // unary operation isn't a bitwise AND, or if the sizes of the operations
47586 // aren't the same.
47587 EVT VT = N->getValueType(0);
47588 bool IsStrict = N->isStrictFPOpcode();
47589 unsigned NumEltBits = VT.getScalarSizeInBits();
47590 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
47591 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
47592 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
47593 VT.getSizeInBits() != Op0.getValueSizeInBits())
47594 return SDValue();
47595
47596 // Now check that the other operand of the AND is a constant. We could
47597 // make the transformation for non-constant splats as well, but it's unclear
47598 // that would be a benefit as it would not eliminate any operations, just
47599 // perform one more step in scalar code before moving to the vector unit.
47600 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
47601 // Bail out if the vector isn't a constant.
47602 if (!BV->isConstant())
47603 return SDValue();
47604
47605 // Everything checks out. Build up the new and improved node.
47606 SDLoc DL(N);
47607 EVT IntVT = BV->getValueType(0);
47608 // Create a new constant of the appropriate type for the transformed
47609 // DAG.
47610 SDValue SourceConst;
47611 if (IsStrict)
47612 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
47613 {N->getOperand(0), SDValue(BV, 0)});
47614 else
47615 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
47616 // The AND node needs bitcasts to/from an integer vector type around it.
47617 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
47618 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
47619 MaskConst);
47620 SDValue Res = DAG.getBitcast(VT, NewAnd);
47621 if (IsStrict)
47622 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
47623 return Res;
47624 }
47625
47626 return SDValue();
47627}
47628
47629/// If we are converting a value to floating-point, try to replace scalar
47630/// truncate of an extracted vector element with a bitcast. This tries to keep
47631/// the sequence on XMM registers rather than moving between vector and GPRs.
47632static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
47633 // TODO: This is currently only used by combineSIntToFP, but it is generalized
47634 // to allow being called by any similar cast opcode.
47635 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
47636 SDValue Trunc = N->getOperand(0);
47637 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
47638 return SDValue();
47639
47640 SDValue ExtElt = Trunc.getOperand(0);
47641 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47642 !isNullConstant(ExtElt.getOperand(1)))
47643 return SDValue();
47644
47645 EVT TruncVT = Trunc.getValueType();
47646 EVT SrcVT = ExtElt.getValueType();
47647 unsigned DestWidth = TruncVT.getSizeInBits();
47648 unsigned SrcWidth = SrcVT.getSizeInBits();
47649 if (SrcWidth % DestWidth != 0)
47650 return SDValue();
47651
47652 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
47653 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
47654 unsigned VecWidth = SrcVecVT.getSizeInBits();
47655 unsigned NumElts = VecWidth / DestWidth;
47656 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
47657 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
47658 SDLoc DL(N);
47659 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
47660 BitcastVec, ExtElt.getOperand(1));
47661 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
47662}
47663
47664static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
47665 const X86Subtarget &Subtarget) {
47666 bool IsStrict = N->isStrictFPOpcode();
47667 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
47668 EVT VT = N->getValueType(0);
47669 EVT InVT = Op0.getValueType();
47670
47671 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
47672 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
47673 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
47674 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
47675 SDLoc dl(N);
47676 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
47677 InVT.getVectorNumElements());
47678 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
47679
47680 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
47681 if (IsStrict)
47682 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
47683 {N->getOperand(0), P});
47684 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
47685 }
47686
47687 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
47688 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
47689 // the optimization here.
47690 if (DAG.SignBitIsZero(Op0)) {
47691 if (IsStrict)
47692 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
47693 {N->getOperand(0), Op0});
47694 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
47695 }
47696
47697 return SDValue();
47698}
47699
47700static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
47701 TargetLowering::DAGCombinerInfo &DCI,
47702 const X86Subtarget &Subtarget) {
47703 // First try to optimize away the conversion entirely when it's
47704 // conditionally from a constant. Vectors only.
47705 bool IsStrict = N->isStrictFPOpcode();
47706 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
47707 return Res;
47708
47709 // Now move on to more general possibilities.
47710 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
47711 EVT VT = N->getValueType(0);
47712 EVT InVT = Op0.getValueType();
47713
47714 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
47715 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
47716 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
47717 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
47718 SDLoc dl(N);
47719 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
47720 InVT.getVectorNumElements());
47721 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
47722 if (IsStrict)
47723 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
47724 {N->getOperand(0), P});
47725 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
47726 }
47727
47728 // Without AVX512DQ we only support i64 to float scalar conversion. For both
47729 // vectors and scalars, see if we know that the upper bits are all the sign
47730 // bit, in which case we can truncate the input to i32 and convert from that.
47731 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
47732 unsigned BitWidth = InVT.getScalarSizeInBits();
47733 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
47734 if (NumSignBits >= (BitWidth - 31)) {
47735 EVT TruncVT = MVT::i32;
47736 if (InVT.isVector())
47737 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
47738 InVT.getVectorNumElements());
47739 SDLoc dl(N);
47740 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
47741 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
47742 if (IsStrict)
47743 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
47744 {N->getOperand(0), Trunc});
47745 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
47746 }
47747 // If we're after legalize and the type is v2i32 we need to shuffle and
47748 // use CVTSI2P.
47749 assert(InVT == MVT::v2i64 && "Unexpected VT!");
47750 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
47751 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
47752 { 0, 2, -1, -1 });
47753 if (IsStrict)
47754 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
47755 {N->getOperand(0), Shuf});
47756 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
47757 }
47758 }
47759
47760 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
47761 // a 32-bit target where SSE doesn't support i64->FP operations.
47762 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
47763 Op0.getOpcode() == ISD::LOAD) {
47764 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
47765
47766 // This transformation is not supported if the result type is f16 or f128.
47767 if (VT == MVT::f16 || VT == MVT::f128)
47768 return SDValue();
47769
47770 // If we have AVX512DQ we can use packed conversion instructions unless
47771 // the VT is f80.
47772 if (Subtarget.hasDQI() && VT != MVT::f80)
47773 return SDValue();
47774
47775 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
47776 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
47777 std::pair<SDValue, SDValue> Tmp =
47778 Subtarget.getTargetLowering()->BuildFILD(
47779 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
47780 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
47781 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
47782 return Tmp.first;
47783 }
47784 }
47785
47786 if (IsStrict)
47787 return SDValue();
47788
47789 if (SDValue V = combineToFPTruncExtElt(N, DAG))
47790 return V;
47791
47792 return SDValue();
47793}
47794
47795static bool needCarryOrOverflowFlag(SDValue Flags) {
47796 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
47797
47798 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
47799 UI != UE; ++UI) {
47800 SDNode *User = *UI;
47801
47802 X86::CondCode CC;
47803 switch (User->getOpcode()) {
47804 default:
47805 // Be conservative.
47806 return true;
47807 case X86ISD::SETCC:
47808 case X86ISD::SETCC_CARRY:
47809 CC = (X86::CondCode)User->getConstantOperandVal(0);
47810 break;
47811 case X86ISD::BRCOND:
47812 CC = (X86::CondCode)User->getConstantOperandVal(2);
47813 break;
47814 case X86ISD::CMOV:
47815 CC = (X86::CondCode)User->getConstantOperandVal(2);
47816 break;
47817 }
47818
47819 switch (CC) {
47820 default: break;
47821 case X86::COND_A: case X86::COND_AE:
47822 case X86::COND_B: case X86::COND_BE:
47823 case X86::COND_O: case X86::COND_NO:
47824 case X86::COND_G: case X86::COND_GE:
47825 case X86::COND_L: case X86::COND_LE:
47826 return true;
47827 }
47828 }
47829
47830 return false;
47831}
47832
47833static bool onlyZeroFlagUsed(SDValue Flags) {
47834 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
47835
47836 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
47837 UI != UE; ++UI) {
47838 SDNode *User = *UI;
47839
47840 unsigned CCOpNo;
47841 switch (User->getOpcode()) {
47842 default:
47843 // Be conservative.
47844 return false;
47845 case X86ISD::SETCC: CCOpNo = 0; break;
47846 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
47847 case X86ISD::BRCOND: CCOpNo = 2; break;
47848 case X86ISD::CMOV: CCOpNo = 2; break;
47849 }
47850
47851 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
47852 if (CC != X86::COND_E && CC != X86::COND_NE)
47853 return false;
47854 }
47855
47856 return true;
47857}
47858
47859static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
47860 // Only handle test patterns.
47861 if (!isNullConstant(N->getOperand(1)))
47862 return SDValue();
47863
47864 // If we have a CMP of a truncated binop, see if we can make a smaller binop
47865 // and use its flags directly.
47866 // TODO: Maybe we should try promoting compares that only use the zero flag
47867 // first if we can prove the upper bits with computeKnownBits?
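// e.g. (cmp (trunc (xor X, Y)), 0) can become the flag result of
// (xor (trunc X), (trunc Y)), since the narrow op sets ZF the same way.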
47868 SDLoc dl(N);
47869 SDValue Op = N->getOperand(0);
47870 EVT VT = Op.getValueType();
47871
47872 // If we have a constant logical shift that's only used in a comparison
47873 // against zero, turn it into an equivalent AND. This allows turning it into
47874 // a TEST instruction later.
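// e.g. (srl X, C) == 0 iff the top (BitWidth - C) bits of X are zero, and
// (shl X, C) == 0 iff the low (BitWidth - C) bits of X are zero, so the shift
// can be replaced by an AND with the corresponding mask.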
47875 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
47876 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
47877 onlyZeroFlagUsed(SDValue(N, 0))) {
47878 unsigned BitWidth = VT.getSizeInBits();
47879 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
47880 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
47881 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
47882 APInt Mask = Op.getOpcode() == ISD::SRL
47883 ? APInt::getHighBitsSet(BitWidth, MaskBits)
47884 : APInt::getLowBitsSet(BitWidth, MaskBits);
47885 if (Mask.isSignedIntN(32)) {
47886 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
47887 DAG.getConstant(Mask, dl, VT));
47888 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
47889 DAG.getConstant(0, dl, VT));
47890 }
47891 }
47892 }
47893
47894 // Look for a truncate with a single use.
47895 if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
47896 return SDValue();
47897
47898 Op = Op.getOperand(0);
47899
47900 // Arithmetic op can only have one use.
47901 if (!Op.hasOneUse())
47902 return SDValue();
47903
47904 unsigned NewOpc;
47905 switch (Op.getOpcode()) {
47906 default: return SDValue();
47907 case ISD::AND:
47908 // Skip AND with a constant. We have special handling for AND with an
47909 // immediate during isel to generate TEST instructions.
47910 if (isa<ConstantSDNode>(Op.getOperand(1)))
47911 return SDValue();
47912 NewOpc = X86ISD::AND;
47913 break;
47914 case ISD::OR: NewOpc = X86ISD::OR; break;
47915 case ISD::XOR: NewOpc = X86ISD::XOR; break;
47916 case ISD::ADD:
47917 // If the carry or overflow flag is used, we can't truncate.
47918 if (needCarryOrOverflowFlag(SDValue(N, 0)))
47919 return SDValue();
47920 NewOpc = X86ISD::ADD;
47921 break;
47922 case ISD::SUB:
47923 // If the carry or overflow flag is used, we can't truncate.
47924 if (needCarryOrOverflowFlag(SDValue(N, 0)))
47925 return SDValue();
47926 NewOpc = X86ISD::SUB;
47927 break;
47928 }
47929
47930 // We found an op we can narrow. Truncate its inputs.
47931 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
47932 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
47933
47934 // Use a X86 specific opcode to avoid DAG combine messing with it.
47935 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
47936 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
47937
47938 // For AND, keep a CMP so that we can match the test pattern.
47939 if (NewOpc == X86ISD::AND)
47940 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
47941 DAG.getConstant(0, dl, VT));
47942
47943 // Return the flags.
47944 return Op.getValue(1);
47945}
47946
47947static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
47948 TargetLowering::DAGCombinerInfo &DCI) {
47949 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
47950        "Expected X86ISD::ADD or X86ISD::SUB");
47951
47952 SDLoc DL(N);
47953 SDValue LHS = N->getOperand(0);
47954 SDValue RHS = N->getOperand(1);
47955 MVT VT = LHS.getSimpleValueType();
47956 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
47957
47958 // If we don't use the flag result, simplify back to a generic ADD/SUB.
47959 if (!N->hasAnyUseOfValue(1)) {
47960 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
47961 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
47962 }
47963
47964 // Fold any similar generic ADD/SUB opcodes to reuse this node.
47965 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
47966 SDValue Ops[] = {N0, N1};
47967 SDVTList VTs = DAG.getVTList(N->getValueType(0));
47968 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
47969 SDValue Op(N, 0);
47970 if (Negate)
47971 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
47972 DCI.CombineTo(GenericAddSub, Op);
47973 }
47974 };
47975 MatchGeneric(LHS, RHS, false);
47976 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
47977
47978 return SDValue();
47979}
47980
47981static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
47982 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
47983 MVT VT = N->getSimpleValueType(0);
47984 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
47985 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
47986 N->getOperand(0), N->getOperand(1),
47987 Flags);
47988 }
47989
47990 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
47991 // iff the flag result is dead.
47992 SDValue Op0 = N->getOperand(0);
47993 SDValue Op1 = N->getOperand(1);
47994 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
47995 !N->hasAnyUseOfValue(1))
47996 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
47997 Op0.getOperand(1), N->getOperand(2));
47998
47999 return SDValue();
48000}
48001
48002// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
48003static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
48004 TargetLowering::DAGCombinerInfo &DCI) {
48005 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
48006 // the result is either zero or one (depending on the input carry bit).
48007 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
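// i.e. (adc 0, 0, EFLAGS) --> (and (setcc_carry COND_B, EFLAGS), 1), with the
// carry-out known to be zero.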
48008 if (X86::isZeroNode(N->getOperand(0)) &&
48009 X86::isZeroNode(N->getOperand(1)) &&
48010 // We don't have a good way to replace an EFLAGS use, so only do this when
48011 // dead right now.
48012 SDValue(N, 1).use_empty()) {
48013 SDLoc DL(N);
48014 EVT VT = N->getValueType(0);
48015 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
48016 SDValue Res1 =
48017 DAG.getNode(ISD::AND, DL, VT,
48018 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48019 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48020 N->getOperand(2)),
48021 DAG.getConstant(1, DL, VT));
48022 return DCI.CombineTo(N, Res1, CarryOut);
48023 }
48024
48025 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
48026 MVT VT = N->getSimpleValueType(0);
48027 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
48028 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
48029 N->getOperand(0), N->getOperand(1),
48030 Flags);
48031 }
48032
48033 return SDValue();
48034}
48035
48036/// If this is an add or subtract where one operand is produced by a cmp+setcc,
48037/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
48038/// with CMP+{ADC, SBB}.
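/// For example, X + SETB Z becomes (adc X, 0) and X - SETB Z becomes
/// (sbb X, 0), reusing the carry flag instead of materializing the setcc.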
48039static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
48040 bool IsSub = N->getOpcode() == ISD::SUB;
48041 SDValue X = N->getOperand(0);
48042 SDValue Y = N->getOperand(1);
48043
48044 // If this is an add, canonicalize a zext operand to the RHS.
48045 // TODO: Incomplete? What if both sides are zexts?
48046 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
48047 Y.getOpcode() != ISD::ZERO_EXTEND)
48048 std::swap(X, Y);
48049
48050 // Look through a one-use zext.
48051 bool PeekedThroughZext = false;
48052 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
48053 Y = Y.getOperand(0);
48054 PeekedThroughZext = true;
48055 }
48056
48057 // If this is an add, canonicalize a setcc operand to the RHS.
48058 // TODO: Incomplete? What if both sides are setcc?
48059 // TODO: Should we allow peeking through a zext of the other operand?
48060 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
48061 Y.getOpcode() != X86ISD::SETCC)
48062 std::swap(X, Y);
48063
48064 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
48065 return SDValue();
48066
48067 SDLoc DL(N);
48068 EVT VT = N->getValueType(0);
48069 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
48070
48071 // If X is -1 or 0, then we have an opportunity to avoid constants required in
48072 // the general case below.
48073 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
48074 if (ConstantX) {
48075 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
48076 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
48077 // This is a complicated way to get -1 or 0 from the carry flag:
48078 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48079 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48080 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48081 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48082 Y.getOperand(1));
48083 }
48084
48085 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
48086 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
48087 SDValue EFLAGS = Y->getOperand(1);
48088 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
48089 EFLAGS.getValueType().isInteger() &&
48090 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48091 // Swap the operands of a SUB, and we have the same pattern as above.
48092 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
48093 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
48094 SDValue NewSub = DAG.getNode(
48095 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48096 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48097 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
48098 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48099 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48100 NewEFLAGS);
48101 }
48102 }
48103 }
48104
48105 if (CC == X86::COND_B) {
48106 // X + SETB Z --> adc X, 0
48107 // X - SETB Z --> sbb X, 0
48108 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48109 DAG.getVTList(VT, MVT::i32), X,
48110 DAG.getConstant(0, DL, VT), Y.getOperand(1));
48111 }
48112
48113 if (CC == X86::COND_A) {
48114 SDValue EFLAGS = Y.getOperand(1);
48115 // Try to convert COND_A into COND_B in an attempt to facilitate
48116 // materializing "setb reg".
48117 //
48118 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
48119 // cannot take an immediate as its first operand.
48120 //
48121 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48122 EFLAGS.getValueType().isInteger() &&
48123 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48124 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
48125 EFLAGS.getNode()->getVTList(),
48126 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48127 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
48128 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48129 DAG.getVTList(VT, MVT::i32), X,
48130 DAG.getConstant(0, DL, VT), NewEFLAGS);
48131 }
48132 }
48133
48134 if (CC == X86::COND_AE) {
48135 // X + SETAE --> sbb X, -1
48136 // X - SETAE --> adc X, -1
48137 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
48138 DAG.getVTList(VT, MVT::i32), X,
48139 DAG.getConstant(-1, DL, VT), Y.getOperand(1));
48140 }
48141
48142 if (CC == X86::COND_BE) {
48143 // X + SETBE --> sbb X, -1
48144 // X - SETBE --> adc X, -1
48145 SDValue EFLAGS = Y.getOperand(1);
48146 // Try to convert COND_BE into COND_AE in an attempt to facilitate
48147 // materializing "setae reg".
48148 //
48149 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
48150 // cannot take an immediate as its first operand.
48151 //
48152 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48153 EFLAGS.getValueType().isInteger() &&
48154 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48155 SDValue NewSub = DAG.getNode(
48156 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48157 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48158 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
48159 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
48160 DAG.getVTList(VT, MVT::i32), X,
48161 DAG.getConstant(-1, DL, VT), NewEFLAGS);
48162 }
48163 }
48164
48165 if (CC != X86::COND_E && CC != X86::COND_NE)
48166 return SDValue();
48167
48168 SDValue Cmp = Y.getOperand(1);
48169 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
48170 !X86::isZeroNode(Cmp.getOperand(1)) ||
48171 !Cmp.getOperand(0).getValueType().isInteger())
48172 return SDValue();
48173
48174 SDValue Z = Cmp.getOperand(0);
48175 EVT ZVT = Z.getValueType();
48176
48177 // If X is -1 or 0, then we have an opportunity to avoid constants required in
48178 // the general case below.
48179 if (ConstantX) {
48180 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
48181 // fake operands:
48182 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
48183 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
48184 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
48185 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
48186 SDValue Zero = DAG.getConstant(0, DL, ZVT);
48187 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
48188 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
48189 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48190 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48191 SDValue(Neg.getNode(), 1));
48192 }
48193
48194 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
48195 // with fake operands:
48196 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
48197 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
48198 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
48199 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
48200 SDValue One = DAG.getConstant(1, DL, ZVT);
48201 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
48202 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
48203 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48204 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48205 Cmp1.getValue(1));
48206 }
48207 }
48208
48209 // (cmp Z, 1) sets the carry flag if Z is 0.
48210 SDValue One = DAG.getConstant(1, DL, ZVT);
48211 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
48212 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
48213
48214 // Add the flags type for ADC/SBB nodes.
48215 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
48216
48217 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
48218 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
48219 if (CC == X86::COND_NE)
48220 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
48221 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
48222
48223 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
48224 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
48225 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
48226 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
48227}
48228
48229static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
48230 const SDLoc &DL, EVT VT,
48231 const X86Subtarget &Subtarget) {
48232 // Example of pattern we try to detect:
48233 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
48234 //(add (build_vector (extract_elt t, 0),
48235 // (extract_elt t, 2),
48236 // (extract_elt t, 4),
48237 // (extract_elt t, 6)),
48238 // (build_vector (extract_elt t, 1),
48239 // (extract_elt t, 3),
48240 // (extract_elt t, 5),
48241 // (extract_elt t, 7)))
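// i.e. each 32-bit result lane is the sum of two adjacent 16-bit products,
// which is exactly what PMADDWD computes.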
48242
48243 if (!Subtarget.hasSSE2())
48244 return SDValue();
48245
48246 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
48247 Op1.getOpcode() != ISD::BUILD_VECTOR)
48248 return SDValue();
48249
48250 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
48251 VT.getVectorNumElements() < 4 ||
48252 !isPowerOf2_32(VT.getVectorNumElements()))
48253 return SDValue();
48254
48255 // Check if one of Op0,Op1 is of the form:
48256 // (build_vector (extract_elt Mul, 0),
48257 // (extract_elt Mul, 2),
48258 // (extract_elt Mul, 4),
48259 // ...
48260 // the other is of the form:
48261 // (build_vector (extract_elt Mul, 1),
48262 // (extract_elt Mul, 3),
48263 // (extract_elt Mul, 5),
48264 // ...
48265 // and identify Mul.
48266 SDValue Mul;
48267 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
48268 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
48269 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
48270 // TODO: Be more tolerant to undefs.
48271 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48272 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48273 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48274 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
48275 return SDValue();
48276 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
48277 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
48278 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
48279 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
48280 if (!Const0L || !Const1L || !Const0H || !Const1H)
48281 return SDValue();
48282 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
48283 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
48284 // Commutativity of mul allows factors of a product to reorder.
48285 if (Idx0L > Idx1L)
48286 std::swap(Idx0L, Idx1L);
48287 if (Idx0H > Idx1H)
48288 std::swap(Idx0H, Idx1H);
48289 // Commutativity of add allows pairs of factors to reorder.
48290 if (Idx0L > Idx0H) {
48291 std::swap(Idx0L, Idx0H);
48292 std::swap(Idx1L, Idx1H);
48293 }
48294 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
48295 Idx1H != 2 * i + 3)
48296 return SDValue();
48297 if (!Mul) {
48298 // First time an extract_elt's source vector is visited. Must be a MUL
48299 // with twice the number of vector elements of the BUILD_VECTOR.
48300 // Both extracts must be from the same MUL.
48301 Mul = Op0L->getOperand(0);
48302 if (Mul->getOpcode() != ISD::MUL ||
48303 Mul.getValueType().getVectorNumElements() != 2 * e)
48304 return SDValue();
48305 }
48306 // Check that the extract is from the same MUL previously seen.
48307 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
48308 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
48309 return SDValue();
48310 }
48311
48312 // Check if the Mul source can be safely shrunk.
48313 ShrinkMode Mode;
48314 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
48315 Mode == ShrinkMode::MULU16)
48316 return SDValue();
48317
48318 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
48319 VT.getVectorNumElements() * 2);
48320 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
48321 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
48322
48323 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48324 ArrayRef<SDValue> Ops) {
48325 EVT InVT = Ops[0].getValueType();
48326 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
48327 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
48328 InVT.getVectorNumElements() / 2);
48329 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
48330 };
48331 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
48332}
48333
48334// Attempt to turn this pattern into PMADDWD.
48335// (add (mul (sext (build_vector)), (sext (build_vector))),
48336// (mul (sext (build_vector)), (sext (build_vector)))
48337static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
48338 const SDLoc &DL, EVT VT,
48339 const X86Subtarget &Subtarget) {
48340 if (!Subtarget.hasSSE2())
48341 return SDValue();
48342
48343 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
48344 return SDValue();
48345
48346 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
48347 VT.getVectorNumElements() < 4 ||
48348 !isPowerOf2_32(VT.getVectorNumElements()))
48349 return SDValue();
48350
48351 SDValue N00 = N0.getOperand(0);
48352 SDValue N01 = N0.getOperand(1);
48353 SDValue N10 = N1.getOperand(0);
48354 SDValue N11 = N1.getOperand(1);
48355
48356 // All inputs need to be sign extends.
48357 // TODO: Support ZERO_EXTEND from known positive?
48358 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
48359 N01.getOpcode() != ISD::SIGN_EXTEND ||
48360 N10.getOpcode() != ISD::SIGN_EXTEND ||
48361 N11.getOpcode() != ISD::SIGN_EXTEND)
48362 return SDValue();
48363
48364 // Peek through the extends.
48365 N00 = N00.getOperand(0);
48366 N01 = N01.getOperand(0);
48367 N10 = N10.getOperand(0);
48368 N11 = N11.getOperand(0);
48369
48370 // Must be extending from vXi16.
48371 EVT InVT = N00.getValueType();
48372 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
48373 N10.getValueType() != InVT || N11.getValueType() != InVT)
48374 return SDValue();
48375
48376 // All inputs should be build_vectors.
48377 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
48378 N01.getOpcode() != ISD::BUILD_VECTOR ||
48379 N10.getOpcode() != ISD::BUILD_VECTOR ||
48380 N11.getOpcode() != ISD::BUILD_VECTOR)
48381 return SDValue();
48382
48383 // For each element, we need to ensure we have an odd element from one
48384 // vector multiplied by the odd element of the other vector, and the even
48385 // element from one of the same vectors multiplied by the even element from
48386 // the other vector. So for each element i, we need to make sure this
48387 // operation is being performed:
48388 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
48389 SDValue In0, In1;
48390 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
48391 SDValue N00Elt = N00.getOperand(i);
48392 SDValue N01Elt = N01.getOperand(i);
48393 SDValue N10Elt = N10.getOperand(i);
48394 SDValue N11Elt = N11.getOperand(i);
48395 // TODO: Be more tolerant to undefs.
48396 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48397 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48398 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
48399 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
48400 return SDValue();
48401 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
48402 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
48403 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
48404 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
48405 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
48406 return SDValue();
48407 unsigned IdxN00 = ConstN00Elt->getZExtValue();
48408 unsigned IdxN01 = ConstN01Elt->getZExtValue();
48409 unsigned IdxN10 = ConstN10Elt->getZExtValue();
48410 unsigned IdxN11 = ConstN11Elt->getZExtValue();
48411 // Add is commutative so indices can be reordered.
48412 if (IdxN00 > IdxN10) {
48413 std::swap(IdxN00, IdxN10);
48414 std::swap(IdxN01, IdxN11);
48415 }
48416 // N0 indices must be the even element. N1 indices must be the next odd element.
48417 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
48418 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
48419 return SDValue();
48420 SDValue N00In = N00Elt.getOperand(0);
48421 SDValue N01In = N01Elt.getOperand(0);
48422 SDValue N10In = N10Elt.getOperand(0);
48423 SDValue N11In = N11Elt.getOperand(0);
48424 // First time we find an input capture it.
48425 if (!In0) {
48426 In0 = N00In;
48427 In1 = N01In;
48428 }
48429 // Mul is commutative so the input vectors can be in any order.
48430 // Canonicalize to make the compares easier.
48431 if (In0 != N00In)
48432 std::swap(N00In, N01In);
48433 if (In0 != N10In)
48434 std::swap(N10In, N11In);
48435 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
48436 return SDValue();
48437 }
48438
48439 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48440 ArrayRef<SDValue> Ops) {
48441 // Shrink by adding truncate nodes and let DAGCombine fold with the
48442 // sources.
48443 EVT OpVT = Ops[0].getValueType();
48444 assert(OpVT.getScalarType() == MVT::i16 &&
48445 "Unexpected scalar element type");
48446 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
48447 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
48448 OpVT.getVectorNumElements() / 2);
48449 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
48450 };
48451 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
48452 PMADDBuilder);
48453}
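
A hedged sketch of the kind of source loop whose vectorization commonly produces the (add (mul (sext ...), (sext ...)), (mul (sext ...), (sext ...))) shape matched above; the function name and signature are illustrative only:

#include <cstddef>
#include <cstdint>

// After vectorization, each i32 lane holds A[2*i]*B[2*i] + A[2*i+1]*B[2*i+1],
// which is exactly the per-element check performed in the loop above.
static int32_t dotProductI16(const int16_t *A, const int16_t *B, size_t N) {
  int32_t Sum = 0;
  for (size_t i = 0; i != N; ++i)
    Sum += int32_t(A[i]) * int32_t(B[i]);
  return Sum;
}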
48454
48455static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
48456 const X86Subtarget &Subtarget) {
48457 EVT VT = N->getValueType(0);
48458 SDValue Op0 = N->getOperand(0);
48459 SDValue Op1 = N->getOperand(1);
48460 bool IsAdd = N->getOpcode() == ISD::ADD;
48461 assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
48462
48463 SmallVector<int, 8> PostShuffleMask;
48464 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
48465 VT == MVT::v8i32) &&
48466 Subtarget.hasSSSE3() &&
48467 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
48468 auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
48469 ArrayRef<SDValue> Ops) {
48470 return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
48471 Ops[0].getValueType(), Ops);
48472 };
48473 SDValue HorizBinOp =
48474 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
48475 if (!PostShuffleMask.empty())
48476 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
48477 DAG.getUNDEF(VT), PostShuffleMask);
48478 return HorizBinOp;
48479 }
48480
48481 return SDValue();
48482}
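
For orientation, HADD adds adjacent element pairs within each operand and concatenates the results; a minimal scalar model of the v4i32 case handled above (names and fixed sizes are illustrative):

#include <cstdint>

// Scalar model of a 4 x i32 horizontal add:
// Out = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }
static void haddRef(const int32_t A[4], const int32_t B[4], int32_t Out[4]) {
  Out[0] = A[0] + A[1];
  Out[1] = A[2] + A[3];
  Out[2] = B[0] + B[1];
  Out[3] = B[2] + B[3];
}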
48483
48484static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
48485 TargetLowering::DAGCombinerInfo &DCI,
48486 const X86Subtarget &Subtarget) {
48487 EVT VT = N->getValueType(0);
48488 SDValue Op0 = N->getOperand(0);
48489 SDValue Op1 = N->getOperand(1);
48490
48491 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
48492 return MAdd;
48493 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
48494 return MAdd;
48495
48496 // Try to synthesize horizontal adds from adds of shuffles.
48497 if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
48498 return V;
48499
48500 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
48501 // (sub Y, (sext (vXi1 X))).
48502 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
48503 // generic DAG combine without a legal type check, but adding this there
48504 // caused regressions.
48505 if (VT.isVector()) {
48506 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48507 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
48508 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
48509 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
48510 SDLoc DL(N);
48511 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
48512 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
48513 }
48514
48515 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
48516 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
48517 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
48518 SDLoc DL(N);
48519 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
48520 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
48521 }
48522 }
48523
48524 return combineAddOrSubToADCOrSBB(N, DAG);
48525}
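
The (add (zext (vXi1 X)), Y) -> (sub Y, (sext (vXi1 X))) rewrite above works because zext of an i1 yields 0 or 1 while sext yields 0 or -1; a small scalar check of that identity (the helper name is illustrative):

#include <cassert>
#include <cstdint>

static void checkZextSextIdentity(bool X, int64_t Y) {
  int64_t ZExt = X ? 1 : 0;  // zext i1 -> {0, 1}
  int64_t SExt = X ? -1 : 0; // sext i1 -> {0, -1}
  assert(ZExt + Y == Y - SExt && "add-of-zext must equal sub-of-sext");
}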
48526
48527static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
48528 const X86Subtarget &Subtarget) {
48529 SDValue Op0 = N->getOperand(0);
48530 SDValue Op1 = N->getOperand(1);
48531 EVT VT = N->getValueType(0);
48532
48533 if (!VT.isVector())
48534 return SDValue();
48535
48536 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
48537 // is only worth it with SSSE3 (PSHUFB).
48538 EVT EltVT = VT.getVectorElementType();
48539 if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
48540 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
48541 !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
48542 return SDValue();
48543
48544 SDValue SubusLHS, SubusRHS;
48545 // Try to find umax(a,b) - b or a - umin(a,b) patterns,
48546 // as they may be converted to subus(a,b).
48547 // TODO: Need to add IR canonicalization for this code.
48548 if (Op0.getOpcode() == ISD::UMAX) {
48549 SubusRHS = Op1;
48550 SDValue MaxLHS = Op0.getOperand(0);
48551 SDValue MaxRHS = Op0.getOperand(1);
48552 if (MaxLHS == Op1)
48553 SubusLHS = MaxRHS;
48554 else if (MaxRHS == Op1)
48555 SubusLHS = MaxLHS;
48556 else
48557 return SDValue();
48558 } else if (Op1.getOpcode() == ISD::UMIN) {
48559 SubusLHS = Op0;
48560 SDValue MinLHS = Op1.getOperand(0);
48561 SDValue MinRHS = Op1.getOperand(1);
48562 if (MinLHS == Op0)
48563 SubusRHS = MinRHS;
48564 else if (MinRHS == Op0)
48565 SubusRHS = MinLHS;
48566 else
48567 return SDValue();
48568 } else if (Op1.getOpcode() == ISD::TRUNCATE &&
48569 Op1.getOperand(0).getOpcode() == ISD::UMIN &&
48570 (EltVT == MVT::i8 || EltVT == MVT::i16)) {
48571 // Special case where the UMIN has been truncated. Try to push the truncate
48572 // further up. This is similar to the i32/i64 special processing.
48573 SubusLHS = Op0;
48574 SDValue MinLHS = Op1.getOperand(0).getOperand(0);
48575 SDValue MinRHS = Op1.getOperand(0).getOperand(1);
48576 EVT TruncVT = Op1.getOperand(0).getValueType();
48577 if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
48578 TruncVT == MVT::v8i64)) &&
48579 !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
48580 return SDValue();
48581 SDValue OpToSaturate;
48582 if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
48583 MinLHS.getOperand(0) == Op0)
48584 OpToSaturate = MinRHS;
48585 else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
48586 MinRHS.getOperand(0) == Op0)
48587 OpToSaturate = MinLHS;
48588 else
48589 return SDValue();
48590
48591 // Saturate the non-extended input and then truncate it.
48592 SDLoc DL(N);
48593 SDValue SaturationConst =
48594 DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
48595 VT.getScalarSizeInBits()),
48596 DL, TruncVT);
48597 SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
48598 SaturationConst);
48599 SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
48600 } else
48601 return SDValue();
48602
48603 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
48604 // special preprocessing in some cases.
48605 if (EltVT == MVT::i8 || EltVT == MVT::i16)
48606 return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
48607
48608 assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
48609 "Unexpected VT!");
48610
48611 // The special preprocessing case can only be applied
48612 // if the value was zero extended from 16 bit,
48613 // so we require the upper 16 bits to be zero for 32 bit
48614 // values, or the upper 48 bits for 64 bit values.
48615 KnownBits Known = DAG.computeKnownBits(SubusLHS);
48616 unsigned NumZeros = Known.countMinLeadingZeros();
48617 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
48618 return SDValue();
48619
48620 EVT ExtType = SubusLHS.getValueType();
48621 EVT ShrinkedType;
48622 if (VT == MVT::v8i32 || VT == MVT::v8i64)
48623 ShrinkedType = MVT::v8i16;
48624 else
48625 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
48626
48627 // If SubusLHS is zero extended, truncate SubusRHS to its
48628 // size: SubusRHS = umin(0xFFF.., SubusRHS).
48629 SDValue SaturationConst =
48630 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
48631 ShrinkedType.getScalarSizeInBits()),
48632 SDLoc(SubusLHS), ExtType);
48633 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
48634 SaturationConst);
48635 SDValue NewSubusLHS =
48636 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
48637 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
48638 SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
48639 NewSubusLHS, NewSubusRHS);
48640
48641 // Zero extend the result; it may be used somewhere as 32 bit.
48642 // If not, the zext and the following trunc will shrink it away.
48643 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
48644}
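
The umax/umin patterns matched above reduce to unsigned saturating subtraction; a scalar check of both forms against a reference usubsat, using hypothetical helper names:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint16_t usubsatRef(uint16_t A, uint16_t B) {
  return A > B ? uint16_t(A - B) : uint16_t(0); // saturate at zero
}

static void checkSubusForms(uint16_t A, uint16_t B) {
  // umax(a, b) - b == usubsat(a, b)
  assert(uint16_t(std::max(A, B) - B) == usubsatRef(A, B));
  // a - umin(a, b) == usubsat(a, b)
  assert(uint16_t(A - std::min(A, B)) == usubsatRef(A, B));
}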
48645
48646static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
48647 TargetLowering::DAGCombinerInfo &DCI,
48648 const X86Subtarget &Subtarget) {
48649 SDValue Op0 = N->getOperand(0);
48650 SDValue Op1 = N->getOperand(1);
48651
48652 // X86 can't encode an immediate LHS of a sub. See if we can push the
48653 // negation into a preceding instruction.
48654 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
48655 // If the RHS of the sub is an XOR with one use and a constant, invert the
48656 // immediate. Then add one to the LHS of the sub so we can turn
48657 // X-Y -> X+~Y+1, saving one register.
48658 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
48659 isa<ConstantSDNode>(Op1.getOperand(1))) {
48660 const APInt &XorC = Op1.getConstantOperandAPInt(1);
48661 EVT VT = Op0.getValueType();
48662 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
48663 Op1.getOperand(0),
48664 DAG.getConstant(~XorC, SDLoc(Op1), VT));
48665 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
48666 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
48667 }
48668 }
48669
48670 // Try to synthesize horizontal subs from subs of shuffles.
48671 if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
48672 return V;
48673
48674 // Try to create PSUBUS if SUB's argument is max/min
48675 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
48676 return V;
48677
48678 return combineAddOrSubToADCOrSBB(N, DAG);
48679}
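
The XOR fold above relies on two's-complement negation: -(Y ^ C1) == (Y ^ ~C1) + 1, so C0 - (Y ^ C1) == (Y ^ ~C1) + (C0 + 1); a scalar check using wrapping unsigned arithmetic (the helper name is illustrative):

#include <cassert>
#include <cstdint>

static void checkSubXorFold(uint32_t C0, uint32_t Y, uint32_t C1) {
  uint32_t Sub = C0 - (Y ^ C1);
  uint32_t Folded = (Y ^ ~C1) + (C0 + 1u); // ~(Y ^ C1) == Y ^ ~C1
  assert(Sub == Folded && "X-Y -> X+~Y+1 must be bit-exact");
}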
48680
48681static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
48682 const X86Subtarget &Subtarget) {
48683 MVT VT = N->getSimpleValueType(0);
48684 SDLoc DL(N);
48685
48686 if (N->getOperand(0) == N->getOperand(1)) {
48687 if (N->getOpcode() == X86ISD::PCMPEQ)
48688 return DAG.getConstant(-1, DL, VT);
48689 if (N->getOpcode() == X86ISD::PCMPGT)
48690 return DAG.getConstant(0, DL, VT);
48691 }
48692
48693 return SDValue();
48694}
48695
48696/// Helper that combines an array of subvector ops as if they were the operands
48697/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
48698/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
48699static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
48700 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
48701 TargetLowering::DAGCombinerInfo &DCI,
48702 const X86Subtarget &Subtarget) {
48703 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
48704 unsigned EltSizeInBits = VT.getScalarSizeInBits();
48705
48706 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
48707 return DAG.getUNDEF(VT);
48708
48709 if (llvm::all_of(Ops, [](SDValue Op) {
48710 return ISD::isBuildVectorAllZeros(Op.getNode());
48711 }))
48712 return getZeroVector(VT, Subtarget, DAG, DL);
48713
48714 SDValue Op0 = Ops[0];
48715 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
48716
48717 // Fold subvector loads into one.
48718 // If needed, look through bitcasts to get to the load.
48719 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
48720 bool Fast;
48721 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
48722 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
48723 *FirstLd->getMemOperand(), &Fast) &&
48724 Fast) {
48725 if (SDValue Ld =
48726 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
48727 return Ld;
48728 }
48729 }
48730
48731 // Repeated subvectors.
48732 if (IsSplat) {
48733 // If this broadcast/subv_broadcast is inserted into both halves, use a
48734 // larger broadcast/subv_broadcast.
48735 if (Op0.getOpcode() == X86ISD::VBROADCAST ||
48736 Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
48737 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
48738
48739 // If this broadcast_load is inserted into both halves, use a larger
48740 // broadcast_load. Update other uses to use an extracted subvector.
48741 if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
48742 auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
48743 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
48744 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
48745 SDValue BcastLd = DAG.getMemIntrinsicNode(
48746 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
48747 MemIntr->getMemOperand());
48748 DAG.ReplaceAllUsesOfValueWith(
48749 Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
48750 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
48751 return BcastLd;
48752 }
48753
48754 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
48755 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
48756 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
48757 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
48758 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
48759 Op0.getOperand(0),
48760 DAG.getIntPtrConstant(0, DL)));
48761
48762 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
48763 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
48764 (Subtarget.hasAVX2() ||
48765 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
48766 Op0.getOperand(0).getValueType() == VT.getScalarType())
48767 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
48768
48769 // concat_vectors(extract_subvector(broadcast(x)),
48770 // extract_subvector(broadcast(x))) -> broadcast(x)
48771 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48772 Op0.getOperand(0).getValueType() == VT) {
48773 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
48774 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
48775 return Op0.getOperand(0);
48776 }
48777 }
48778
48779 // Repeated opcode.
48780 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
48781 // but it currently struggles with different vector widths.
48782 if (llvm::all_of(Ops, [Op0](SDValue Op) {
48783 return Op.getOpcode() == Op0.getOpcode();
48784 })) {
48785 unsigned NumOps = Ops.size();
48786 switch (Op0.getOpcode()) {
48787 case X86ISD::SHUFP: {
48788 // Add SHUFPD support if/when necessary.
48789 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
48790 llvm::all_of(Ops, [Op0](SDValue Op) {
48791 return Op.getOperand(2) == Op0.getOperand(2);
48792 })) {
48793 SmallVector<SDValue, 2> LHS, RHS;
48794 for (unsigned i = 0; i != NumOps; ++i) {
48795 LHS.push_back(Ops[i].getOperand(0));
48796 RHS.push_back(Ops[i].getOperand(1));
48797 }
48798 return DAG.getNode(Op0.getOpcode(), DL, VT,
48799 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
48800 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
48801 Op0.getOperand(2));
48802 }
48803 break;
48804 }
48805 case X86ISD::PSHUFHW:
48806 case X86ISD::PSHUFLW:
48807 case X86ISD::PSHUFD:
48808 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
48809 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
48810 SmallVector<SDValue, 2> Src;
48811 for (unsigned i = 0; i != NumOps; ++i)
48812 Src.push_back(Ops[i].getOperand(0));
48813 return DAG.getNode(Op0.getOpcode(), DL, VT,
48814 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
48815 Op0.getOperand(1));
48816 }
48817 LLVM_FALLTHROUGH;
48818 case X86ISD::VPERMILPI:
48819 // TODO - add support for vXf64/vXi64 shuffles.
48820 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
48821 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
48822 SmallVector<SDValue, 2> Src;
48823 for (unsigned i = 0; i != NumOps; ++i)
48824 Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
48825 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
48826 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
48827 Op0.getOperand(1));
48828 return DAG.getBitcast(VT, Res);
48829 }
48830 break;
48831 case X86ISD::VPERMV3:
48832 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
48833 MVT OpVT = Op0.getSimpleValueType();
48834 int NumSrcElts = OpVT.getVectorNumElements();
48835 SmallVector<int, 64> ConcatMask;
48836 for (unsigned i = 0; i != NumOps; ++i) {
48837 bool IsUnary;
48838 SmallVector<int, 64> SubMask;
48839 SmallVector<SDValue, 2> SubOps;
48840 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
48841 SubMask, IsUnary))
48842 break;
48843 for (int M : SubMask) {
48844 if (0 <= M) {
48845 M += M < NumSrcElts ? 0 : NumSrcElts;
48846 M += i * NumSrcElts;
48847 }
48848 ConcatMask.push_back(M);
48849 }
48850 }
48851 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
48852 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
48853 Ops[1].getOperand(0), DAG, DL);
48854 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
48855 Ops[1].getOperand(2), DAG, DL);
48856 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
48857 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
48858 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
48859 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
48860 }
48861 }
48862 break;
48863 case X86ISD::VSHLI:
48864 case X86ISD::VSRAI:
48865 case X86ISD::VSRLI:
48866 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
48867 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
48868 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
48869 llvm::all_of(Ops, [Op0](SDValue Op) {
48870 return Op0.getOperand(1) == Op.getOperand(1);
48871 })) {
48872 SmallVector<SDValue, 2> Src;
48873 for (unsigned i = 0; i != NumOps; ++i)
48874 Src.push_back(Ops[i].getOperand(0));
48875 return DAG.getNode(Op0.getOpcode(), DL, VT,
48876 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
48877 Op0.getOperand(1));
48878 }
48879 break;
48880 case X86ISD::VPERMI:
48881 case X86ISD::VROTLI:
48882 case X86ISD::VROTRI:
48883 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
48884 llvm::all_of(Ops, [Op0](SDValue Op) {
48885 return Op0.getOperand(1) == Op.getOperand(1);
48886 })) {
48887 SmallVector<SDValue, 2> Src;
48888 for (unsigned i = 0; i != NumOps; ++i)
48889 Src.push_back(Ops[i].getOperand(0));
48890 return DAG.getNode(Op0.getOpcode(), DL, VT,
48891 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
48892 Op0.getOperand(1));
48893 }
48894 break;
48895 case ISD::AND:
48896 case ISD::OR:
48897 case ISD::XOR:
48898 case X86ISD::ANDNP:
48899 // TODO: Add 256-bit support.
48900 if (!IsSplat && VT.is512BitVector()) {
48901 SmallVector<SDValue, 2> LHS, RHS;
48902 for (unsigned i = 0; i != NumOps; ++i) {
48903 LHS.push_back(Ops[i].getOperand(0));
48904 RHS.push_back(Ops[i].getOperand(1));
48905 }
48906 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
48907 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
48908 NumOps * SrcVT.getVectorNumElements());
48909 return DAG.getNode(Op0.getOpcode(), DL, VT,
48910 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
48911 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
48912 }
48913 break;
48914 case X86ISD::HADD:
48915 case X86ISD::HSUB:
48916 case X86ISD::FHADD:
48917 case X86ISD::FHSUB:
48918 case X86ISD::PACKSS:
48919 case X86ISD::PACKUS:
48920 if (!IsSplat && VT.is256BitVector() &&
48921 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
48922 SmallVector<SDValue, 2> LHS, RHS;
48923 for (unsigned i = 0; i != NumOps; ++i) {
48924 LHS.push_back(Ops[i].getOperand(0));
48925 RHS.push_back(Ops[i].getOperand(1));
48926 }
48927 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
48928 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
48929 NumOps * SrcVT.getVectorNumElements());
48930 return DAG.getNode(Op0.getOpcode(), DL, VT,
48931 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
48932 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
48933 }
48934 break;
48935 case X86ISD::PALIGNR:
48936 if (!IsSplat &&
48937 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
48938 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
48939 llvm::all_of(Ops, [Op0](SDValue Op) {
48940 return Op0.getOperand(2) == Op.getOperand(2);
48941 })) {
48942 SmallVector<SDValue, 2> LHS, RHS;
48943 for (unsigned i = 0; i != NumOps; ++i) {
48944 LHS.push_back(Ops[i].getOperand(0));
48945 RHS.push_back(Ops[i].getOperand(1));
48946 }
48947 return DAG.getNode(Op0.getOpcode(), DL, VT,
48948 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
48949 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
48950 Op0.getOperand(2));
48951 }
48952 break;
48953 }
48954 }
48955
48956 return SDValue();
48957}
48958
48959static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
48960 TargetLowering::DAGCombinerInfo &DCI,
48961 const X86Subtarget &Subtarget) {
48962 EVT VT = N->getValueType(0);
48963 EVT SrcVT = N->getOperand(0).getValueType();
48964 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48965
48966 // Don't do anything for i1 vectors.
48967 if (VT.getVectorElementType() == MVT::i1)
48968 return SDValue();
48969
48970 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
48971 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
48972 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
48973 DCI, Subtarget))
48974 return R;
48975 }
48976
48977 return SDValue();
48978}
48979
48980static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
48981 TargetLowering::DAGCombinerInfo &DCI,
48982 const X86Subtarget &Subtarget) {
48983 if (DCI.isBeforeLegalizeOps())
48984 return SDValue();
48985
48986 MVT OpVT = N->getSimpleValueType(0);
48987
48988 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
48989
48990 SDLoc dl(N);
48991 SDValue Vec = N->getOperand(0);
48992 SDValue SubVec = N->getOperand(1);
48993
48994 uint64_t IdxVal = N->getConstantOperandVal(2);
48995 MVT SubVecVT = SubVec.getSimpleValueType();
48996
48997 if (Vec.isUndef() && SubVec.isUndef())
48998 return DAG.getUNDEF(OpVT);
48999
49000 // Inserting undefs/zeros into zeros/undefs is a zero vector.
49001 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
49002 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
49003 return getZeroVector(OpVT, Subtarget, DAG, dl);
49004
49005 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
49006 // If we're inserting into a zero vector and then into a larger zero vector,
49007 // just insert into the larger zero vector directly.
49008 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
49009 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
49010 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
49011 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
49012 getZeroVector(OpVT, Subtarget, DAG, dl),
49013 SubVec.getOperand(1),
49014 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
49015 }
49016
49017 // If we're inserting into a zero vector and our input was extracted from an
49018 // insert into a zero vector of the same type, and the extraction was at
49019 // least as large as the original insertion, just insert the original
49020 // subvector into a zero vector.
49021 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
49022 isNullConstant(SubVec.getOperand(1)) &&
49023 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
49024 SDValue Ins = SubVec.getOperand(0);
49025 if (isNullConstant(Ins.getOperand(2)) &&
49026 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
49027 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
49028 SubVecVT.getFixedSizeInBits())
49029 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
49030 getZeroVector(OpVT, Subtarget, DAG, dl),
49031 Ins.getOperand(1), N->getOperand(2));
49032 }
49033 }
49034
49035 // Stop here if this is an i1 vector.
49036 if (IsI1Vector)
49037 return SDValue();
49038
49039 // If this is an insert of an extract, combine to a shuffle. Don't do this
49040 // if the insert or extract can be represented with a subregister operation.
49041 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
49042 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
49043 (IdxVal != 0 ||
49044 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
49045 int ExtIdxVal = SubVec.getConstantOperandVal(1);
49046 if (ExtIdxVal != 0) {
49047 int VecNumElts = OpVT.getVectorNumElements();
49048 int SubVecNumElts = SubVecVT.getVectorNumElements();
49049 SmallVector<int, 64> Mask(VecNumElts);
49050 // First create an identity shuffle mask.
49051 for (int i = 0; i != VecNumElts; ++i)
49052 Mask[i] = i;
49053 // Now insert the extracted portion.
49054 for (int i = 0; i != SubVecNumElts; ++i)
49055 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
49056
49057 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
49058 }
49059 }
49060
49061 // Match concat_vector style patterns.
49062 SmallVector<SDValue, 2> SubVectorOps;
49063 if (collectConcatOps(N, SubVectorOps)) {
49064 if (SDValue Fold =
49065 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
49066 return Fold;
49067
49068 // If we're inserting all zeros into the upper half, change this to
49069 // a concat with zero. We will match this to a move
49070 // with implicit upper bit zeroing during isel.
49071 // We do this here because we don't want combineConcatVectorOps to
49072 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
49073 if (SubVectorOps.size() == 2 &&
49074 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
49075 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
49076 getZeroVector(OpVT, Subtarget, DAG, dl),
49077 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
49078 }
49079
49080 // If this is a broadcast insert into an upper undef, use a larger broadcast.
49081 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
49082 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
49083
49084 // If this is a broadcast load inserted into an upper undef, use a larger
49085 // broadcast load.
49086 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
49087 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
49088 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
49089 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
49090 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
49091 SDValue BcastLd =
49092 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
49093 MemIntr->getMemoryVT(),
49094 MemIntr->getMemOperand());
49095 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
49096 return BcastLd;
49097 }
49098
49099 return SDValue();
49100}
49101
49102/// If we are extracting a subvector of a vector select and the select condition
49103/// is composed of concatenated vectors, try to narrow the select width. This
49104/// is a common pattern for AVX1 integer code because 256-bit selects may be
49105/// legal, but there is almost no integer math/logic available for 256-bit.
49106/// This function should only be called with legal types (otherwise, the calls
49107/// to get simple value types will assert).
49108static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
49109 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
49110 SmallVector<SDValue, 4> CatOps;
49111 if (Sel.getOpcode() != ISD::VSELECT ||
49112 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
49113 return SDValue();
49114
49115 // Note: We assume simple value types because this should only be called with
49116 // legal operations/types.
49117 // TODO: This can be extended to handle extraction to 256-bits.
49118 MVT VT = Ext->getSimpleValueType(0);
49119 if (!VT.is128BitVector())
49120 return SDValue();
49121
49122 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
49123 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
49124 return SDValue();
49125
49126 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
49127 MVT SelVT = Sel.getSimpleValueType();
49128 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
49129 "Unexpected vector type with legal operations");
49130
49131 unsigned SelElts = SelVT.getVectorNumElements();
49132 unsigned CastedElts = WideVT.getVectorNumElements();
49133 unsigned ExtIdx = Ext->getConstantOperandVal(1);
49134 if (SelElts % CastedElts == 0) {
49135 // The select has the same or more (narrower) elements than the extract
49136 // operand. The extraction index gets scaled by that factor.
49137 ExtIdx *= (SelElts / CastedElts);
49138 } else if (CastedElts % SelElts == 0) {
49139 // The select has fewer (wider) elements than the extract operand. Make sure
49140 // that the extraction index can be divided evenly.
49141 unsigned IndexDivisor = CastedElts / SelElts;
49142 if (ExtIdx % IndexDivisor != 0)
49143 return SDValue();
49144 ExtIdx /= IndexDivisor;
49145 } else {
49146 llvm_unreachable("Element count of simple vector types are not divisible?");
49147 }
49148
49149 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
49150 unsigned NarrowElts = SelElts / NarrowingFactor;
49151 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
49152 SDLoc DL(Ext);
49153 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
49154 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
49155 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
49156 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
49157 return DAG.getBitcast(VT, NarrowSel);
49158}
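
The narrowing above is sound because per-lane selection commutes with subvector extraction; a scalar sketch over plain arrays, with illustrative sizes (8 wide lanes, 4 extracted):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Select on the wide vector and then extract four lanes, versus extracting
// first and selecting narrowly; the lanes must match.
static void checkNarrowSelect(const bool Cond[8], const int32_t T[8],
                              const int32_t F[8], size_t ExtIdx /* 0 or 4 */) {
  int32_t WideSel[8];
  for (size_t i = 0; i != 8; ++i)
    WideSel[i] = Cond[i] ? T[i] : F[i];
  for (size_t i = 0; i != 4; ++i) {
    int32_t Narrow = Cond[ExtIdx + i] ? T[ExtIdx + i] : F[ExtIdx + i];
    assert(WideSel[ExtIdx + i] == Narrow && "narrowed select must match");
  }
}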
49159
49160static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
49161 TargetLowering::DAGCombinerInfo &DCI,
49162 const X86Subtarget &Subtarget) {
49163 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
49164 // eventually get combined/lowered into ANDNP) with a concatenated operand,
49165 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
49166 // We let generic combining take over from there to simplify the
49167 // insert/extract and 'not'.
49168 // This pattern emerges during AVX1 legalization. We handle it before lowering
49169 // to avoid complications like splitting constant vector loads.
49170
49171 // Capture the original wide type in the likely case that we need to bitcast
49172 // back to this type.
49173 if (!N->getValueType(0).isSimple())
49174 return SDValue();
49175
49176 MVT VT = N->getSimpleValueType(0);
49177 SDValue InVec = N->getOperand(0);
49178 unsigned IdxVal = N->getConstantOperandVal(1);
49179 SDValue InVecBC = peekThroughBitcasts(InVec);
49180 EVT InVecVT = InVec.getValueType();
49181 unsigned SizeInBits = VT.getSizeInBits();
49182 unsigned InSizeInBits = InVecVT.getSizeInBits();
49183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49184
49185 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
49186 TLI.isTypeLegal(InVecVT) &&
49187 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
49188 auto isConcatenatedNot = [](SDValue V) {
49189 V = peekThroughBitcasts(V);
49190 if (!isBitwiseNot(V))
49191 return false;
49192 SDValue NotOp = V->getOperand(0);
49193 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
49194 };
49195 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
49196 isConcatenatedNot(InVecBC.getOperand(1))) {
49197 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
49198 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
49199 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
49200 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
49201 }
49202 }
49203
49204 if (DCI.isBeforeLegalizeOps())
49205 return SDValue();
49206
49207 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
49208 return V;
49209
49210 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
49211 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
49212
49213 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
49214 if (VT.getScalarType() == MVT::i1)
49215 return DAG.getConstant(1, SDLoc(N), VT);
49216 return getOnesVector(VT, DAG, SDLoc(N));
49217 }
49218
49219 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
49220 return DAG.getBuildVector(
49221 VT, SDLoc(N),
49222 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
49223
49224 // If we are extracting from an insert into a zero vector, replace with a
49225 // smaller insert into zero if we don't access less than the original
49226 // subvector. Don't do this for i1 vectors.
49227 if (VT.getVectorElementType() != MVT::i1 &&
49228 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
49229 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
49230 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
49231 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
49232 SDLoc DL(N);
49233 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
49234 getZeroVector(VT, Subtarget, DAG, DL),
49235 InVec.getOperand(1), InVec.getOperand(2));
49236 }
49237
49238 // If we're extracting from a broadcast then we're better off just
49239 // broadcasting to the smaller type directly, assuming this is the only use.
49240 // As it's a broadcast we don't care about the extraction index.
49241 if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
49242 InVec.getOperand(0).getValueSizeInBits() <= SizeInBits)
49243 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
49244
49245 if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
49246 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
49247 if (MemIntr->getMemoryVT().getSizeInBits() <= SizeInBits) {
49248 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
49249 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
49250 SDValue BcastLd =
49251 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
49252 MemIntr->getMemoryVT(),
49253 MemIntr->getMemOperand());
49254 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
49255 return BcastLd;
49256 }
49257 }
49258
49259 // If we're extracting an upper subvector from a broadcast we should just
49260 // extract the lowest subvector instead, which should allow
49261 // SimplifyDemandedVectorElts to do more simplifications.
49262 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
49263 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
49264 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
49265
49266 // If we're extracting a broadcasted subvector, just use the source.
49267 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
49268 InVec.getOperand(0).getValueType() == VT)
49269 return InVec.getOperand(0);
49270
49271 // Attempt to extract from the source of a shuffle vector.
49272 if ((InSizeInBits % SizeInBits) == 0 &&
49273 (IdxVal % VT.getVectorNumElements()) == 0) {
49274 SmallVector<int, 32> ShuffleMask;
49275 SmallVector<int, 32> ScaledMask;
49276 SmallVector<SDValue, 2> ShuffleInputs;
49277 unsigned NumSubVecs = InSizeInBits / SizeInBits;
49278 // Decode the shuffle mask and scale it so it's shuffling subvectors.
49279 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
49280 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
49281 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
49282 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
49283 return DAG.getUNDEF(VT);
49284 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
49285 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
49286 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
49287 if (Src.getValueSizeInBits() == InSizeInBits) {
49288 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
49289 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
49290 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
49291 SDLoc(N), SizeInBits);
49292 }
49293 }
49294 }
49295
49296 // If we're extracting the lowest subvector and we're the only user,
49297 // we may be able to perform this with a smaller vector width.
49298 if (IdxVal == 0 && InVec.hasOneUse()) {
49299 unsigned InOpcode = InVec.getOpcode();
49300 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
49301 // v2f64 CVTDQ2PD(v4i32).
49302 if (InOpcode == ISD::SINT_TO_FP &&
49303 InVec.getOperand(0).getValueType() == MVT::v4i32) {
49304 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
49305 }
49306 // v2f64 CVTUDQ2PD(v4i32).
49307 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
49308 InVec.getOperand(0).getValueType() == MVT::v4i32) {
49309 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
49310 }
49311 // v2f64 CVTPS2PD(v4f32).
49312 if (InOpcode == ISD::FP_EXTEND &&
49313 InVec.getOperand(0).getValueType() == MVT::v4f32) {
49314 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
49315 }
49316 }
49317 if ((InOpcode == ISD::ANY_EXTEND ||
49318 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
49319 InOpcode == ISD::ZERO_EXTEND ||
49320 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
49321 InOpcode == ISD::SIGN_EXTEND ||
49322 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
49323 (SizeInBits == 128 || SizeInBits == 256) &&
49324 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
49325 SDLoc DL(N);
49326 SDValue Ext = InVec.getOperand(0);
49327 if (Ext.getValueSizeInBits() > SizeInBits)
49328 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
49329 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
49330 return DAG.getNode(ExtOp, DL, VT, Ext);
49331 }
49332 if (InOpcode == ISD::VSELECT &&
49333 InVec.getOperand(0).getValueType().is256BitVector() &&
49334 InVec.getOperand(1).getValueType().is256BitVector() &&
49335 InVec.getOperand(2).getValueType().is256BitVector()) {
49336 SDLoc DL(N);
49337 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
49338 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
49339 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
49340 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
49341 }
49342 if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
49343 (VT.is128BitVector() || VT.is256BitVector())) {
49344 SDLoc DL(N);
49345 SDValue InVecSrc = InVec.getOperand(0);
49346 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
49347 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
49348 return DAG.getNode(InOpcode, DL, VT, Ext);
49349 }
49350 }
49351
49352 return SDValue();
49353}
49354
49355static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
49356 EVT VT = N->getValueType(0);
49357 SDValue Src = N->getOperand(0);
49358 SDLoc DL(N);
49359
49360 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
49361 // This occurs frequently in our masked scalar intrinsic code and our
49362 // floating point select lowering with AVX512.
49363 // TODO: SimplifyDemandedBits instead?
49364 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
49365 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
49366 if (C->getAPIntValue().isOneValue())
49367 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
49368 Src.getOperand(0));
49369
49370 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
49371 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49372 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
49373 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
49374 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
49375 if (C->isNullValue())
49376 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
49377 Src.getOperand(1));
49378
49379 // Reduce v2i64 to v4i32 if we don't need the upper bits.
49380 // TODO: Move to DAGCombine/SimplifyDemandedBits?
49381 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
49382 auto IsAnyExt64 = [](SDValue Op) {
49383 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
49384 return SDValue();
49385 if (Op.getOpcode() == ISD::ANY_EXTEND &&
49386 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
49387 return Op.getOperand(0);
49388 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
49389 if (Ld->getExtensionType() == ISD::EXTLOAD &&
49390 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
49391 return Op;
49392 return SDValue();
49393 };
49394 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
49395 return DAG.getBitcast(
49396 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
49397 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
49398 }
49399
49400 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
49401 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
49402 Src.getOperand(0).getValueType() == MVT::x86mmx)
49403 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
49404
49405 return SDValue();
49406}
49407
49408// Simplify PMULDQ and PMULUDQ operations.
49409static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
49410 TargetLowering::DAGCombinerInfo &DCI,
49411 const X86Subtarget &Subtarget) {
49412 SDValue LHS = N->getOperand(0);
49413 SDValue RHS = N->getOperand(1);
49414
49415 // Canonicalize constant to RHS.
49416 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
49417 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
49418 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
49419
49420 // Multiply by zero.
49421 // Don't return RHS as it may contain UNDEFs.
49422 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
49423 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
49424
49425 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
49426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49427 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
49428 return SDValue(N, 0);
49429
49430 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
49431 // convert it to any_extend_invec, due to the LegalOperations check, do the
49432 // conversion directly to a vector shuffle manually. This exposes combine
49433 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
49434 // combineX86ShufflesRecursively on SSE4.1 targets.
49435 // FIXME: This is basically a hack around several other issues related to
49436 // ANY_EXTEND_VECTOR_INREG.
49437 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
49438 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
49439 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
49440 LHS.getOperand(0).getValueType() == MVT::v4i32) {
49441 SDLoc dl(N);
49442 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
49443 LHS.getOperand(0), { 0, -1, 1, -1 });
49444 LHS = DAG.getBitcast(MVT::v2i64, LHS);
49445 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
49446 }
49447 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
49448 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
49449 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
49450 RHS.getOperand(0).getValueType() == MVT::v4i32) {
49451 SDLoc dl(N);
49452 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
49453 RHS.getOperand(0), { 0, -1, 1, -1 });
49454 RHS = DAG.getBitcast(MVT::v2i64, RHS);
49455 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
49456 }
49457
49458 return SDValue();
49459}
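
As context for the SimplifyDemandedBits call above: PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit lane of their inputs. A scalar model of the signed PMULDQ form (names are illustrative):

#include <cstddef>
#include <cstdint>

// For each 64-bit lane, truncate both inputs to their low 32 bits,
// sign-extend, and form the full 64-bit product; the upper input bits never
// matter, which is what the demanded-bits simplification exploits.
static void pmuldqRef(const int64_t *A, const int64_t *B, int64_t *Out,
                      size_t NumLanes) {
  for (size_t i = 0; i != NumLanes; ++i)
    Out[i] = int64_t(int32_t(A[i])) * int64_t(int32_t(B[i]));
}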
49460
49461static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
49462 TargetLowering::DAGCombinerInfo &DCI,
49463 const X86Subtarget &Subtarget) {
49464 EVT VT = N->getValueType(0);
49465 SDValue In = N->getOperand(0);
49466 unsigned Opcode = N->getOpcode();
49467 unsigned InOpcode = In.getOpcode();
49468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49469
49470 // Try to merge vector loads and extend_inreg to an extload.
49471 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
49472 In.hasOneUse()) {
49473 auto *Ld = cast<LoadSDNode>(In);
49474 if (Ld->isSimple()) {
49475 MVT SVT = In.getSimpleValueType().getVectorElementType();
49476 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
49477 ? ISD::SEXTLOAD
49478 : ISD::ZEXTLOAD;
49479 EVT MemVT =
49480 EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
49481 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
49482 SDValue Load =
49483 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
49484 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
49485 Ld->getMemOperand()->getFlags());
49486 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
49487 return Load;
49488 }
49489 }
49490 }
49491
49492 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
49493 if (Opcode == InOpcode)
49494 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
49495
49496 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
49497 // -> EXTEND_VECTOR_INREG(X).
49498 // TODO: Handle non-zero subvector indices.
49499 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
49500 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
49501 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
49502 In.getValueSizeInBits())
49503 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
49504
49505 // Attempt to combine as a shuffle.
49506 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
49507 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
49508 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
49509 SDValue Op(N, 0);
49510 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
49511 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49512 return Res;
49513 }
49514
49515 return SDValue();
49516}
49517
49518static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
49519 TargetLowering::DAGCombinerInfo &DCI) {
49520 EVT VT = N->getValueType(0);
49521
49522 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
49523 return DAG.getConstant(0, SDLoc(N), VT);
49524
49525 APInt KnownUndef, KnownZero;
49526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49527 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
49528 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
49529 KnownZero, DCI))
49530 return SDValue(N, 0);
49531
49532 return SDValue();
49533}
49534
49535// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
49536 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
49537 // extra instructions between the conversions due to going to scalar and back.
49538static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
49539 const X86Subtarget &Subtarget) {
49540 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
49541 return SDValue();
49542
49543 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
49544 return SDValue();
49545
49546 if (N->getValueType(0) != MVT::f32 ||
49547 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
49548 return SDValue();
49549
49550 SDLoc dl(N);
49551 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
49552 N->getOperand(0).getOperand(0));
49553 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
49554 DAG.getTargetConstant(4, dl, MVT::i32));
49555 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
49556 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
49557 DAG.getIntPtrConstant(0, dl));
49558}
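
A hedged intrinsics-level sketch of the CVTPS2PH/CVTPH2PS pairing this combine produces; it requires F16C (e.g. compile with -mf16c), and the rounding immediate matches the target constant 4 used above (_MM_FROUND_CUR_DIRECTION):

#include <immintrin.h>

// Round-trip a scalar float through half precision using the packed
// conversions, mirroring the CVTPS2PH + CVTPH2PS node pair built above.
static float roundTripThroughFP16(float X) {
  __m128 V = _mm_set_ss(X);
  __m128i Half = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);
  __m128 Back = _mm_cvtph_ps(Half);
  return _mm_cvtss_f32(Back);
}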
49559
49560static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
49561 const X86Subtarget &Subtarget) {
49562 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
49563 return SDValue();
49564
49565 bool IsStrict = N->isStrictFPOpcode();
49566 EVT VT = N->getValueType(0);
49567 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
49568 EVT SrcVT = Src.getValueType();
49569
49570 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
49571 return SDValue();
49572
49573 if (VT.getVectorElementType() != MVT::f32 &&
49574 VT.getVectorElementType() != MVT::f64)
49575 return SDValue();
49576
49577 unsigned NumElts = VT.getVectorNumElements();
49578 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49579 return SDValue();
49580
49581 SDLoc dl(N);
49582
49583 // Convert the input to vXi16.
49584 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
49585 Src = DAG.getBitcast(IntVT, Src);
49586
49587 // Widen to at least 8 input elements.
49588 if (NumElts < 8) {
49589 unsigned NumConcats = 8 / NumElts;
49590 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
49591 : DAG.getConstant(0, dl, IntVT);
49592 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
49593 Ops[0] = Src;
49594 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
49595 }
49596
49597 // Destination is vXf32 with at least 4 elements.
49598 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
49599 std::max(4U, NumElts));
49600 SDValue Cvt, Chain;
49601 if (IsStrict) {
49602 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
49603 {N->getOperand(0), Src});
49604 Chain = Cvt.getValue(1);
49605 } else {
49606 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
49607 }
49608
49609 if (NumElts < 4) {
49610 assert(NumElts == 2 && "Unexpected size");
49611 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
49612 DAG.getIntPtrConstant(0, dl));
49613 }
49614
49615 if (IsStrict) {
49616 // Extend to the original VT if necessary.
49617 if (Cvt.getValueType() != VT) {
49618 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
49619 {Chain, Cvt});
49620 Chain = Cvt.getValue(1);
49621 }
49622 return DAG.getMergeValues({Cvt, Chain}, dl);
49623 }
49624
49625 // Extend to the original VT if necessary.
49626 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
49627}
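
The widening step above is harmless because the wide conversion only reads lanes that are later kept. A minimal standalone sketch with the F16C intrinsics (F16C host and -mf16c assumed; not part of this file):

#include <immintrin.h>
#include <cstdio>

int main() {
  // Two f16 inputs (0x3C00 = 1.0, 0x4000 = 2.0) padded out to eight i16 lanes.
  __m128i Halves = _mm_setr_epi16(0x3C00, 0x4000, 0, 0, 0, 0, 0, 0);
  __m128 Floats = _mm_cvtph_ps(Halves); // VCVTPH2PS converts the low 4 lanes
  float Out[4];
  _mm_storeu_ps(Out, Floats);
  std::printf("%f %f\n", Out[0], Out[1]); // only the first two lanes are used
  return 0;
}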
49628
49629// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
49630// cases where the loads have the same input chain and the output chains are
49631// unused. This avoids any memory ordering issues.
49632static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
49633 TargetLowering::DAGCombinerInfo &DCI) {
49634 // Only do this if the chain result is unused.
49635 if (N->hasAnyUseOfValue(1))
49636 return SDValue();
49637
49638 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
49639
49640 SDValue Ptr = MemIntrin->getBasePtr();
49641 SDValue Chain = MemIntrin->getChain();
49642 EVT VT = N->getSimpleValueType(0);
49643 EVT MemVT = MemIntrin->getMemoryVT();
49644
49645 // Look at other users of our base pointer and try to find a wider broadcast.
49646 // The input chain and the size of the memory VT must match.
49647 for (SDNode *User : Ptr->uses())
49648 if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
49649 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
49650 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
49651 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
49652 MemVT.getSizeInBits() &&
49653 !User->hasAnyUseOfValue(1) &&
49654 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
49655 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
49656 VT.getSizeInBits());
49657 Extract = DAG.getBitcast(VT, Extract);
49658 return DCI.CombineTo(N, Extract, SDValue(User, 1));
49659 }
49660
49661 return SDValue();
49662}
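
The reuse rule can be modelled without SelectionDAG at all. A purely illustrative sketch with hypothetical helpers (broadcastLoad and extractLow are made-up names, not LLVM APIs):

#include <array>
#include <cstring>

// Broadcast one scalar loaded from P into an N-lane vector.
template <int N> std::array<float, N> broadcastLoad(const float *P) {
  std::array<float, N> V;
  V.fill(*P);
  return V;
}

// Take the low N lanes of a wider M-lane vector.
template <int N, int M>
std::array<float, N> extractLow(const std::array<float, M> &Wide) {
  static_assert(N <= M, "can only narrow");
  std::array<float, N> V;
  std::memcpy(V.data(), Wide.data(), N * sizeof(float));
  return V;
}

// A 4-lane broadcast of *P equals extractLow<4>(broadcastLoad<8>(P)), which is
// why the combine can forward the wider node's value and drop the extra load.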
49663
49664static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
49665 const X86Subtarget &Subtarget) {
49666 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
49667 return SDValue();
49668
49669 EVT VT = N->getValueType(0);
49670 SDValue Src = N->getOperand(0);
49671 EVT SrcVT = Src.getValueType();
49672
49673 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
49674 SrcVT.getVectorElementType() != MVT::f32)
49675 return SDValue();
49676
49677 unsigned NumElts = VT.getVectorNumElements();
49678 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49679 return SDValue();
49680
49681 SDLoc dl(N);
49682
49683 // Widen to at least 4 input elements.
49684 if (NumElts < 4)
49685 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
49686 DAG.getConstantFP(0.0, dl, SrcVT));
49687
49688 // Destination is vXi16 with at least 8 elements.
49689 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49690 std::max(8U, NumElts));
49691 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
49692 DAG.getTargetConstant(4, dl, MVT::i32));
49693
49694 // Extract down to the real number of elements.
49695 if (NumElts < 8) {
49696 EVT IntVT = VT.changeVectorElementTypeToInteger();
49697 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
49698 DAG.getIntPtrConstant(0, dl));
49699 }
49700
49701 return DAG.getBitcast(VT, Cvt);
49702}
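
The same zero-padding trick works in the opposite direction. A minimal standalone sketch with the F16C intrinsic (F16C host and -mf16c assumed), where the immediate 4 is _MM_FROUND_CUR_DIRECTION, as in the target constant above:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 Floats = _mm_setr_ps(1.0f, 2.0f, 0.0f, 0.0f); // two real lanes plus padding
  __m128i Halves = _mm_cvtps_ph(Floats, 4);            // VCVTPS2PH
  unsigned short Out[8];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Out), Halves);
  std::printf("0x%04x 0x%04x\n", Out[0], Out[1]); // 0x3c00 0x4000
  return 0;
}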
49703
49704static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
49705 SDValue Src = N->getOperand(0);
49706
49707 // Turn MOVDQ2Q+simple_load into an mmx load.
49708 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
49709 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
49710
49711 if (LN->isSimple()) {
49712 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
49713 LN->getBasePtr(),
49714 LN->getPointerInfo(),
49715 LN->getOriginalAlign(),
49716 LN->getMemOperand()->getFlags());
49717 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
49718 return NewLd;
49719 }
49720 }
49721
49722 return SDValue();
49723}
49724
49725static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
49726 TargetLowering::DAGCombinerInfo &DCI) {
49727 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
49728 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49729 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49730 APInt::getAllOnesValue(NumBits), DCI))
49731 return SDValue(N, 0);
49732
49733 return SDValue();
49734}
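
For readers unfamiliar with PDEP, a minimal standalone illustration with the BMI2 intrinsic (_pdep_u32 from immintrin.h; BMI2 host and -mbmi2 assumed). It shows that only as many low source bits as the mask has set bits can affect the result, which is the kind of fact the SimplifyDemandedBits call above can exploit:

#include <immintrin.h>
#include <cstdio>

int main() {
  // Deposit the low 4 source bits into the 4 positions selected by the mask.
  unsigned R = _pdep_u32(0b1011u, 0b11001100u);
  std::printf("0x%x\n", R); // 0x8c == 0b10001100
  return 0;
}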
49735
49736SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
49737 DAGCombinerInfo &DCI) const {
49738 SelectionDAG &DAG = DCI.DAG;
49739 switch (N->getOpcode()) {
49740 default: break;
49741 case ISD::SCALAR_TO_VECTOR:
49742 return combineScalarToVector(N, DAG);
49743 case ISD::EXTRACT_VECTOR_ELT:
49744 case X86ISD::PEXTRW:
49745 case X86ISD::PEXTRB:
49746 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
49747 case ISD::CONCAT_VECTORS:
49748 return combineConcatVectors(N, DAG, DCI, Subtarget);
49749 case ISD::INSERT_SUBVECTOR:
49750 return combineInsertSubvector(N, DAG, DCI, Subtarget);
49751 case ISD::EXTRACT_SUBVECTOR:
49752 return combineExtractSubvector(N, DAG, DCI, Subtarget);
49753 case ISD::VSELECT:
49754 case ISD::SELECT:
49755 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
49756 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
49757 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
49758 case X86ISD::CMP: return combineCMP(N, DAG);
49759 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
49760 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
49761 case X86ISD::ADD:
49762 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
49763 case X86ISD::SBB: return combineSBB(N, DAG);
49764 case X86ISD::ADC: return combineADC(N, DAG, DCI);
49765 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
49766 case ISD::SHL: return combineShiftLeft(N, DAG);
49767 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
49768 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
49769 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
49770 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
49771 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
49772 case X86ISD::BEXTR:
49773 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
49774 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
49775 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
49776 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
49777 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
49778 case X86ISD::VEXTRACT_STORE:
49779 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
49780 case ISD::SINT_TO_FP:
49781 case ISD::STRICT_SINT_TO_FP:
49782 return combineSIntToFP(N, DAG, DCI, Subtarget);
49783 case ISD::UINT_TO_FP:
49784 case ISD::STRICT_UINT_TO_FP:
49785 return combineUIntToFP(N, DAG, Subtarget);
49786 case ISD::FADD:
49787 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
49788 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
49789 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
49790 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
49791 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
49792 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
49793 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
49794 case X86ISD::FXOR:
49795 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
49796 case X86ISD::FMIN:
49797 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
49798 case ISD::FMINNUM:
49799 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
49800 case X86ISD::CVTSI2P:
49801 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
49802 case X86ISD::CVTP2SI:
49803 case X86ISD::CVTP2UI:
49804 case X86ISD::STRICT_CVTTP2SI:
49805 case X86ISD::CVTTP2SI:
49806 case X86ISD::STRICT_CVTTP2UI:
49807 case X86ISD::CVTTP2UI:
49808 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
49809 case X86ISD::STRICT_CVTPH2PS:
49810 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
49811 case X86ISD::BT: return combineBT(N, DAG, DCI);
49812 case ISD::ANY_EXTEND:
49813 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
49814 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
49815 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
49816 case ISD::ANY_EXTEND_VECTOR_INREG:
49817 case ISD::SIGN_EXTEND_VECTOR_INREG:
49818 case ISD::ZERO_EXTEND_VECTOR_INREG:
49819 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
49820 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
49821 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
49822 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
49823 case X86ISD::PACKSS:
49824 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
49825 case X86ISD::HADD:
49826 case X86ISD::HSUB:
49827 case X86ISD::FHADD:
49828 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
49829 case X86ISD::VSHL:
49830 case X86ISD::VSRA:
49831 case X86ISD::VSRL:
49832 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
49833 case X86ISD::VSHLI:
49834 case X86ISD::VSRAI:
49835 case X86ISD::VSRLI:
49836 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
49837 case ISD::INSERT_VECTOR_ELT:
49838 case X86ISD::PINSRB:
49839 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
49840 case X86ISD::SHUFP: // Handle all target specific shuffles
49841 case X86ISD::INSERTPS:
49842 case X86ISD::EXTRQI:
49843 case X86ISD::INSERTQI:
49844 case X86ISD::VALIGN:
49845 case X86ISD::PALIGNR:
49846 case X86ISD::VSHLDQ:
49847 case X86ISD::VSRLDQ:
49848 case X86ISD::BLENDI:
49849 case X86ISD::UNPCKH:
49850 case X86ISD::UNPCKL:
49851 case X86ISD::MOVHLPS:
49852 case X86ISD::MOVLHPS:
49853 case X86ISD::PSHUFB:
49854 case X86ISD::PSHUFD:
49855 case X86ISD::PSHUFHW:
49856 case X86ISD::PSHUFLW:
49857 case X86ISD::MOVSHDUP:
49858 case X86ISD::MOVSLDUP:
49859 case X86ISD::MOVDDUP:
49860 case X86ISD::MOVSS:
49861 case X86ISD::MOVSD:
49862 case X86ISD::VBROADCAST:
49863 case X86ISD::VPPERM:
49864 case X86ISD::VPERMI:
49865 case X86ISD::VPERMV:
49866 case X86ISD::VPERMV3:
49867 case X86ISD::VPERMIL2:
49868 case X86ISD::VPERMILPI:
49869 case X86ISD::VPERMILPV:
49870 case X86ISD::VPERM2X128:
49871 case X86ISD::SHUF128:
49872 case X86ISD::VZEXT_MOVL:
49873 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
49874 case X86ISD::FMADD_RND:
49875 case X86ISD::FMSUB:
49876 case X86ISD::STRICT_FMSUB:
49877 case X86ISD::FMSUB_RND:
49878 case X86ISD::FNMADD:
49879 case X86ISD::STRICT_FNMADD:
49880 case X86ISD::FNMADD_RND:
49881 case X86ISD::FNMSUB:
49882 case X86ISD::STRICT_FNMSUB:
49883 case X86ISD::FNMSUB_RND:
49884 case ISD::FMA:
49885 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
49886 case X86ISD::FMADDSUB_RND:
49887 case X86ISD::FMSUBADD_RND:
49888 case X86ISD::FMADDSUB:
49889 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
49890 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
49891 case X86ISD::MGATHER:
49892 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
49893 case ISD::MGATHER:
49894 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
49895 case X86ISD::PCMPEQ:
49896 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
49897 case X86ISD::PMULDQ:
49898 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
49899 case X86ISD::KSHIFTL:
49900 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
49901 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
49902 case ISD::STRICT_FP_EXTEND:
49903 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
49904 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
49905 case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
49906 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
49907 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
49908 }
49909
49910 return SDValue();
49911}
49912
49913bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
49914 if (!isTypeLegal(VT))
49915 return false;
49916
49917 // There are no vXi8 shifts.
49918 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
49919 return false;
49920
49921 // TODO: Almost no 8-bit ops are desirable because they have no actual
49922 // size/speed advantages vs. 32-bit ops, but they do have a major
49923 // potential disadvantage by causing partial register stalls.
49924 //
49925 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
49926 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
49927 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
49928 // check for a constant operand to the multiply.
49929 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
49930 return false;
49931
49932 // i16 instruction encodings are longer and some i16 instructions are slow,
49933 // so those are not desirable.
49934 if (VT == MVT::i16) {
49935 switch (Opc) {
49936 default:
49937 break;
49938 case ISD::LOAD:
49939 case ISD::SIGN_EXTEND:
49940 case ISD::ZERO_EXTEND:
49941 case ISD::ANY_EXTEND:
49942 case ISD::SHL:
49943 case ISD::SRA:
49944 case ISD::SRL:
49945 case ISD::SUB:
49946 case ISD::ADD:
49947 case ISD::MUL:
49948 case ISD::AND:
49949 case ISD::OR:
49950 case ISD::XOR:
49951 return false;
49952 }
49953 }
49954
49955 // Any legal type not explicitly accounted for above here is desirable.
49956 return true;
49957}
49958
49959SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
49960 SDValue Value, SDValue Addr,
49961 SelectionDAG &DAG) const {
49962 const Module *M = DAG.getMachineFunction().getMMI().getModule();
49963 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
49964 if (IsCFProtectionSupported) {
49965 // When control-flow branch protection is enabled, we need to add the
49966 // notrack prefix to the indirect branch.
49967 // To do that we create an NT_BRIND SDNode.
49968 // During ISel, the pattern will convert it to a jmp with the NoTrack prefix.
49969 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
49970 }
49971
49972 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
49973}
49974
49975bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
49976 EVT VT = Op.getValueType();
49977 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
49978 isa<ConstantSDNode>(Op.getOperand(1));
49979
49980 // i16 is legal, but undesirable since i16 instruction encodings are longer
49981 // and some i16 instructions are slow.
49982 // 8-bit multiply-by-constant can usually be expanded to something cheaper
49983 // using LEA and/or other ALU ops.
49984 if (VT != MVT::i16 && !Is8BitMulByConstant)
49985 return false;
49986
49987 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
49988 if (!Op.hasOneUse())
49989 return false;
49990 SDNode *User = *Op->use_begin();
49991 if (!ISD::isNormalStore(User))
49992 return false;
49993 auto *Ld = cast<LoadSDNode>(Load);
49994 auto *St = cast<StoreSDNode>(User);
49995 return Ld->getBasePtr() == St->getBasePtr();
49996 };
49997
49998 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
49999 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
50000 return false;
50001 if (!Op.hasOneUse())
50002 return false;
50003 SDNode *User = *Op->use_begin();
50004 if (User->getOpcode() != ISD::ATOMIC_STORE)
50005 return false;
50006 auto *Ld = cast<AtomicSDNode>(Load);
50007 auto *St = cast<AtomicSDNode>(User);
50008 return Ld->getBasePtr() == St->getBasePtr();
50009 };
50010
50011 bool Commute = false;
50012 switch (Op.getOpcode()) {
50013 default: return false;
50014 case ISD::SIGN_EXTEND:
50015 case ISD::ZERO_EXTEND:
50016 case ISD::ANY_EXTEND:
50017 break;
50018 case ISD::SHL:
50019 case ISD::SRA:
50020 case ISD::SRL: {
50021 SDValue N0 = Op.getOperand(0);
50022 // Look out for (store (shl (load), x)).
50023 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
50024 return false;
50025 break;
50026 }
50027 case ISD::ADD:
50028 case ISD::MUL:
50029 case ISD::AND:
50030 case ISD::OR:
50031 case ISD::XOR:
50032 Commute = true;
50033 LLVM_FALLTHROUGH;
50034 case ISD::SUB: {
50035 SDValue N0 = Op.getOperand(0);
50036 SDValue N1 = Op.getOperand(1);
50037 // Avoid disabling potential load folding opportunities.
50038 if (MayFoldLoad(N1) &&
50039 (!Commute || !isa<ConstantSDNode>(N0) ||
50040 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
50041 return false;
50042 if (MayFoldLoad(N0) &&
50043 ((Commute && !isa<ConstantSDNode>(N1)) ||
50044 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
50045 return false;
50046 if (IsFoldableAtomicRMW(N0, Op) ||
50047 (Commute && IsFoldableAtomicRMW(N1, Op)))
50048 return false;
50049 }
50050 }
50051
50052 PVT = MVT::i32;
50053 return true;
50054}
50055
50056//===----------------------------------------------------------------------===//
50057// X86 Inline Assembly Support
50058//===----------------------------------------------------------------------===//
50059
50060// Helper to match a string separated by whitespace.
50061static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
50062 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
50063
50064 for (StringRef Piece : Pieces) {
50065 if (!S.startswith(Piece)) // Check if the piece matches.
50066 return false;
50067
50068 S = S.substr(Piece.size());
50069 StringRef::size_type Pos = S.find_first_not_of(" \t");
50070 if (Pos == 0) // We matched a prefix.
50071 return false;
50072
50073 S = S.substr(Pos);
50074 }
50075
50076 return S.empty();
50077}
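
A standalone sketch of the same matching idea (hypothetical helper using plain std::string instead of StringRef; it does whole-token comparison, which is what the prefix-plus-whitespace check above amounts to):

#include <cassert>
#include <sstream>
#include <string>
#include <vector>

static bool matchAsmSketch(const std::string &S,
                           const std::vector<std::string> &Pieces) {
  std::istringstream In(S);
  std::string Tok;
  for (const std::string &Piece : Pieces)
    if (!(In >> Tok) || Tok != Piece) // each whitespace-separated token must
      return false;                   // equal the corresponding piece
  return !(In >> Tok);                // and nothing may follow
}

int main() {
  assert(matchAsmSketch("  bswap $0", {"bswap", "$0"}));
  assert(!matchAsmSketch("bswapl $0, $1", {"bswapl", "$0"}));
  return 0;
}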
50078
50079static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
50080
50081 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
50082 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
50083 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
50084 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
50085
50086 if (AsmPieces.size() == 3)
50087 return true;
50088 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
50089 return true;
50090 }
50091 }
50092 return false;
50093}
50094
50095bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
50096 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
50097
50098 const std::string &AsmStr = IA->getAsmString();
50099
50100 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
50101 if (!Ty || Ty->getBitWidth() % 16 != 0)
50102 return false;
50103
50104 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
50105 SmallVector<StringRef, 4> AsmPieces;
50106 SplitString(AsmStr, AsmPieces, ";\n");
50107
50108 switch (AsmPieces.size()) {
50109 default: return false;
50110 case 1:
50111 // FIXME: this should verify that we are targeting a 486 or better. If not,
50112 // we will turn this bswap into something that will be lowered to logical
50113 // ops instead of emitting the bswap asm. For now, we don't support 486 or
50114 // lower so don't worry about this.
50115 // bswap $0
50116 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
50117 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
50118 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
50119 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
50120 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
50121 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
50122 // No need to check constraints, nothing other than the equivalent of
50123 // "=r,0" would be valid here.
50124 return IntrinsicLowering::LowerToByteSwap(CI);
50125 }
50126
50127 // rorw $$8, ${0:w} --> llvm.bswap.i16
50128 if (CI->getType()->isIntegerTy(16) &&
50129 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
50130 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
50131 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
50132 AsmPieces.clear();
50133 StringRef ConstraintsStr = IA->getConstraintString();
50134 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
50135 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
50136 if (clobbersFlagRegisters(AsmPieces))
50137 return IntrinsicLowering::LowerToByteSwap(CI);
50138 }
50139 break;
50140 case 3:
50141 if (CI->getType()->isIntegerTy(32) &&
50142 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
50143 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
50144 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
50145 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
50146 AsmPieces.clear();
50147 StringRef ConstraintsStr = IA->getConstraintString();
50148 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
50149 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
50150 if (clobbersFlagRegisters(AsmPieces))
50151 return IntrinsicLowering::LowerToByteSwap(CI);
50152 }
50153
50154 if (CI->getType()->isIntegerTy(64)) {
50155 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
50156 if (Constraints.size() >= 2 &&
50157 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
50158 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
50159 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
50160 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
50161 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
50162 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
50163 return IntrinsicLowering::LowerToByteSwap(CI);
50164 }
50165 }
50166 break;
50167 }
50168 return false;
50169}
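
A plausible source-level shape of the three-piece pattern handled in the 32-bit case above (GNU inline asm; the front end rewrites "%w0" to "${0:w}" and "$8" to "$$8" in the IR asm string, which is what matchAsm compares against):

#include <cstdint>

static inline uint32_t BSwap32ViaRor(uint32_t X) {
  // With the "cc" clobber present, the whole asm can collapse to llvm.bswap.i32.
  __asm__("rorw $8, %w0;"
          "rorl $16, %0;"
          "rorw $8, %w0"
          : "=r"(X)
          : "0"(X)
          : "cc");
  return X;
}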
50170
50171static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
50172 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
50173 .Case("{@cca}", X86::COND_A)
50174 .Case("{@ccae}", X86::COND_AE)
50175 .Case("{@ccb}", X86::COND_B)
50176 .Case("{@ccbe}", X86::COND_BE)
50177 .Case("{@ccc}", X86::COND_B)
50178 .Case("{@cce}", X86::COND_E)
50179 .Case("{@ccz}", X86::COND_E)
50180 .Case("{@ccg}", X86::COND_G)
50181 .Case("{@ccge}", X86::COND_GE)
50182 .Case("{@ccl}", X86::COND_L)
50183 .Case("{@ccle}", X86::COND_LE)
50184 .Case("{@ccna}", X86::COND_BE)
50185 .Case("{@ccnae}", X86::COND_B)
50186 .Case("{@ccnb}", X86::COND_AE)
50187 .Case("{@ccnbe}", X86::COND_A)
50188 .Case("{@ccnc}", X86::COND_AE)
50189 .Case("{@ccne}", X86::COND_NE)
50190 .Case("{@ccnz}", X86::COND_NE)
50191 .Case("{@ccng}", X86::COND_LE)
50192 .Case("{@ccnge}", X86::COND_L)
50193 .Case("{@ccnl}", X86::COND_GE)
50194 .Case("{@ccnle}", X86::COND_G)
50195 .Case("{@ccno}", X86::COND_NO)
50196 .Case("{@ccnp}", X86::COND_NP)
50197 .Case("{@ccns}", X86::COND_NS)
50198 .Case("{@cco}", X86::COND_O)
50199 .Case("{@ccp}", X86::COND_P)
50200 .Case("{@ccs}", X86::COND_S)
50201 .Default(X86::COND_INVALID);
50202 return Cond;
50203}
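
A hedged usage example of the "@cc" flag-output constraints parsed above (GCC/Clang asm flag outputs; SubIsZero is a made-up helper). "=@ccz" binds the zero flag, which maps to X86::COND_E here:

static inline bool SubIsZero(unsigned A, unsigned B) {
  bool Zero;
  __asm__("subl %2, %1" : "=@ccz"(Zero), "+r"(A) : "r"(B));
  return Zero; // true exactly when the subtraction set ZF
}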
50204
50205/// Given a constraint letter, return the type of constraint for this target.
50206X86TargetLowering::ConstraintType
50207X86TargetLowering::getConstraintType(StringRef Constraint) const {
50208 if (Constraint.size() == 1) {
50209 switch (Constraint[0]) {
50210 case 'R':
50211 case 'q':
50212 case 'Q':
50213 case 'f':
50214 case 't':
50215 case 'u':
50216 case 'y':
50217 case 'x':
50218 case 'v':
50219 case 'l':
50220 case 'k': // AVX512 masking registers.
50221 return C_RegisterClass;
50222 case 'a':
50223 case 'b':
50224 case 'c':
50225 case 'd':
50226 case 'S':
50227 case 'D':
50228 case 'A':
50229 return C_Register;
50230 case 'I':
50231 case 'J':
50232 case 'K':
50233 case 'N':
50234 case 'G':
50235 case 'L':
50236 case 'M':
50237 return C_Immediate;
50238 case 'C':
50239 case 'e':
50240 case 'Z':
50241 return C_Other;
50242 default:
50243 break;
50244 }
50245 }
50246 else if (Constraint.size() == 2) {
50247 switch (Constraint[0]) {
50248 default:
50249 break;
50250 case 'Y':
50251 switch (Constraint[1]) {
50252 default:
50253 break;
50254 case 'z':
50255 return C_Register;
50256 case 'i':
50257 case 'm':
50258 case 'k':
50259 case 't':
50260 case '2':
50261 return C_RegisterClass;
50262 }
50263 }
50264 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
50265 return C_Other;
50266 return TargetLowering::getConstraintType(Constraint);
50267}
50268
50269/// Examine constraint type and operand type and determine a weight value.
50270/// This object must already have been set up with the operand type
50271/// and the current alternative constraint selected.
50272TargetLowering::ConstraintWeight
50273 X86TargetLowering::getSingleConstraintMatchWeight(
50274 AsmOperandInfo &info, const char *constraint) const {
50275 ConstraintWeight weight = CW_Invalid;
50276 Value *CallOperandVal = info.CallOperandVal;
50277 // If we don't have a value, we can't do a match,
50278 // but allow it at the lowest weight.
50279 if (!CallOperandVal)
50280 return CW_Default;
50281 Type *type = CallOperandVal->getType();
50282 // Look at the constraint type.
50283 switch (*constraint) {
50284 default:
50285 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
50286 LLVM_FALLTHROUGH;
50287 case 'R':
50288 case 'q':
50289 case 'Q':
50290 case 'a':
50291 case 'b':
50292 case 'c':
50293 case 'd':
50294 case 'S':
50295 case 'D':
50296 case 'A':
50297 if (CallOperandVal->getType()->isIntegerTy())
50298 weight = CW_SpecificReg;
50299 break;
50300 case 'f':
50301 case 't':
50302 case 'u':
50303 if (type->isFloatingPointTy())
50304 weight = CW_SpecificReg;
50305 break;
50306 case 'y':
50307 if (type->isX86_MMXTy() && Subtarget.hasMMX())
50308 weight = CW_SpecificReg;
50309 break;
50310 case 'Y':
50311 if (StringRef(constraint).size() != 2)
50312 break;
50313 switch (constraint[1]) {
50314 default:
50315 return CW_Invalid;
50316 // XMM0
50317 case 'z':
50318 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
50319 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
50320 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
50321 return CW_SpecificReg;
50322 return CW_Invalid;
50323 // Conditional OpMask regs (AVX512)
50324 case 'k':
50325 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
50326 return CW_Register;
50327 return CW_Invalid;
50328 // Any MMX reg
50329 case 'm':
50330 if (type->isX86_MMXTy() && Subtarget.hasMMX())
50331 return weight;
50332 return CW_Invalid;
50333 // Any SSE reg when ISA >= SSE2, same as 'x'
50334 case 'i':
50335 case 't':
50336 case '2':
50337 if (!Subtarget.hasSSE2())
50338 return CW_Invalid;
50339 break;
50340 }
50341 break;
50342 case 'v':
50343 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
50344 weight = CW_Register;
50345 LLVM_FALLTHROUGH;
50346 case 'x':
50347 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
50348 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
50349 weight = CW_Register;
50350 break;
50351 case 'k':
50352 // Enable conditional vector operations using %k<#> registers.
50353 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
50354 weight = CW_Register;
50355 break;
50356 case 'I':
50357 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
50358 if (C->getZExtValue() <= 31)
50359 weight = CW_Constant;
50360 }
50361 break;
50362 case 'J':
50363 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50364 if (C->getZExtValue() <= 63)
50365 weight = CW_Constant;
50366 }
50367 break;
50368 case 'K':
50369 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50370 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
50371 weight = CW_Constant;
50372 }
50373 break;
50374 case 'L':
50375 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50376 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
50377 weight = CW_Constant;
50378 }
50379 break;
50380 case 'M':
50381 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50382 if (C->getZExtValue() <= 3)
50383 weight = CW_Constant;
50384 }
50385 break;
50386 case 'N':
50387 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50388 if (C->getZExtValue() <= 0xff)
50389 weight = CW_Constant;
50390 }
50391 break;
50392 case 'G':
50393 case 'C':
50394 if (isa<ConstantFP>(CallOperandVal)) {
50395 weight = CW_Constant;
50396 }
50397 break;
50398 case 'e':
50399 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50400 if ((C->getSExtValue() >= -0x80000000LL) &&
50401 (C->getSExtValue() <= 0x7fffffffLL))
50402 weight = CW_Constant;
50403 }
50404 break;
50405 case 'Z':
50406 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
50407 if (C->getZExtValue() <= 0xffffffff)
50408 weight = CW_Constant;
50409 }
50410 break;
50411 }
50412 return weight;
50413}
50414
50415/// Try to replace an X constraint, which matches anything, with another that
50416/// has more specific requirements based on the type of the corresponding
50417/// operand.
50418const char *X86TargetLowering::
50419LowerXConstraint(EVT ConstraintVT) const {
50420 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
50421 // 'f' like normal targets.
50422 if (ConstraintVT.isFloatingPoint()) {
50423 if (Subtarget.hasSSE1())
50424 return "x";
50425 }
50426
50427 return TargetLowering::LowerXConstraint(ConstraintVT);
50428}
50429
50430// Lower @cc targets via setcc.
50431SDValue X86TargetLowering::LowerAsmOutputForConstraint(
50432 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
50433 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
50434 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
50435 if (Cond == X86::COND_INVALID)
50436 return SDValue();
50437 // Check that return type is valid.
50438 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
50439 OpInfo.ConstraintVT.getSizeInBits() < 8)
50440 report_fatal_error("Flag output operand is of invalid type");
50441
50442 // Get EFLAGS register. Only update chain when copyfrom is glued.
50443 if (Flag.getNode()) {
50444 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
50445 Chain = Flag.getValue(1);
50446 } else
50447 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
50448 // Extract CC code.
50449 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
50450 // Extend to 32 bits.
50451 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
50452
50453 return Result;
50454}
50455
50456/// Lower the specified operand into the Ops vector.
50457/// If it is invalid, don't add anything to Ops.
50458void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
50459 std::string &Constraint,
50460 std::vector<SDValue>&Ops,
50461 SelectionDAG &DAG) const {
50462 SDValue Result;
50463
50464 // Only support length 1 constraints for now.
50465 if (Constraint.length() > 1) return;
50466
50467 char ConstraintLetter = Constraint[0];
50468 switch (ConstraintLetter) {
50469 default: break;
50470 case 'I':
50471 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50472 if (C->getZExtValue() <= 31) {
50473 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50474 Op.getValueType());
50475 break;
50476 }
50477 }
50478 return;
50479 case 'J':
50480 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50481 if (C->getZExtValue() <= 63) {
50482 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50483 Op.getValueType());
50484 break;
50485 }
50486 }
50487 return;
50488 case 'K':
50489 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50490 if (isInt<8>(C->getSExtValue())) {
50491 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50492 Op.getValueType());
50493 break;
50494 }
50495 }
50496 return;
50497 case 'L':
50498 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50499 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
50500 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
50501 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
50502 Op.getValueType());
50503 break;
50504 }
50505 }
50506 return;
50507 case 'M':
50508 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50509 if (C->getZExtValue() <= 3) {
50510 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50511 Op.getValueType());
50512 break;
50513 }
50514 }
50515 return;
50516 case 'N':
50517 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50518 if (C->getZExtValue() <= 255) {
50519 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50520 Op.getValueType());
50521 break;
50522 }
50523 }
50524 return;
50525 case 'O':
50526 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50527 if (C->getZExtValue() <= 127) {
50528 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50529 Op.getValueType());
50530 break;
50531 }
50532 }
50533 return;
50534 case 'e': {
50535 // 32-bit signed value
50536 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50537 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
50538 C->getSExtValue())) {
50539 // Widen to 64 bits here to get it sign extended.
50540 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
50541 break;
50542 }
50543 // FIXME gcc accepts some relocatable values here too, but only in certain
50544 // memory models; it's complicated.
50545 }
50546 return;
50547 }
50548 case 'Z': {
50549 // 32-bit unsigned value
50550 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
50551 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
50552 C->getZExtValue())) {
50553 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
50554 Op.getValueType());
50555 break;
50556 }
50557 }
50558 // FIXME gcc accepts some relocatable values here too, but only in certain
50559 // memory models; it's complicated.
50560 return;
50561 }
50562 case 'i': {
50563 // Literal immediates are always ok.
50564 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
50565 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
50566 BooleanContent BCont = getBooleanContents(MVT::i64);
50567 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
50568 : ISD::SIGN_EXTEND;
50569 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
50570 : CST->getSExtValue();
50571 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
50572 break;
50573 }
50574
50575 // In any sort of PIC mode, addresses need to be computed at runtime by
50576 // adding in a register or some sort of table lookup. These can't
50577 // be used as immediates.
50578 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
50579 return;
50580
50581 // If we are in non-pic codegen mode, we allow the address of a global (with
50582 // an optional displacement) to be used with 'i'.
50583 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
50584 // If we require an extra load to get this address, as in PIC mode, we
50585 // can't accept it.
50586 if (isGlobalStubReference(
50587 Subtarget.classifyGlobalReference(GA->getGlobal())))
50588 return;
50589 break;
50590 }
50591 }
50592
50593 if (Result.getNode()) {
50594 Ops.push_back(Result);
50595 return;
50596 }
50597 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
50598}
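
A hedged usage example for one of the constraints validated above ('I', an immediate in [0, 31]; ShiftLeftBy3 is a made-up helper):

static inline unsigned ShiftLeftBy3(unsigned X) {
  // The "I" operand must be a compile-time constant in [0, 31], exactly the
  // range checked in the 'I' case above; it is emitted as "$3".
  __asm__("shll %1, %0" : "+r"(X) : "I"(3));
  return X;
}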
50599
50600/// Check if \p RC is a general purpose register class.
50601/// I.e., GR* or one of their variant.
50602static bool isGRClass(const TargetRegisterClass &RC) {
50603 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
50604 RC.hasSuperClassEq(&X86::GR16RegClass) ||
50605 RC.hasSuperClassEq(&X86::GR32RegClass) ||
50606 RC.hasSuperClassEq(&X86::GR64RegClass) ||
50607 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
50608}
50609
50610/// Check if \p RC is a vector register class.
50611/// I.e., FR* / VR* or one of their variant.
50612static bool isFRClass(const TargetRegisterClass &RC) {
50613 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
50614 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
50615 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
50616 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
50617 RC.hasSuperClassEq(&X86::VR512RegClass);
50618}
50619
50620/// Check if \p RC is a mask register class.
50621/// I.e., VK* or one of their variant.
50622static bool isVKClass(const TargetRegisterClass &RC) {
50623 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
50624 RC.hasSuperClassEq(&X86::VK2RegClass) ||
50625 RC.hasSuperClassEq(&X86::VK4RegClass) ||
50626 RC.hasSuperClassEq(&X86::VK8RegClass) ||
50627 RC.hasSuperClassEq(&X86::VK16RegClass) ||
50628 RC.hasSuperClassEq(&X86::VK32RegClass) ||
50629 RC.hasSuperClassEq(&X86::VK64RegClass);
50630}
50631
50632std::pair<unsigned, const TargetRegisterClass *>
50633X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
50634 StringRef Constraint,
50635 MVT VT) const {
50636 // First, see if this is a constraint that directly corresponds to an LLVM
50637 // register class.
50638 if (Constraint.size() == 1) {
50639 // GCC Constraint Letters
50640 switch (Constraint[0]) {
50641 default: break;
50642 // 'A' means [ER]AX + [ER]DX.
50643 case 'A':
50644 if (Subtarget.is64Bit())
50645 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
50646 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
50647 "Expecting 64, 32 or 16 bit subtarget");
50648 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
50649
50650 // TODO: Slight differences here in allocation order and leaving
50651 // RIP in the class. Do they matter any more here than they do
50652 // in the normal allocation?
50653 case 'k':
50654 if (Subtarget.hasAVX512()) {
50655 if (VT == MVT::i1)
50656 return std::make_pair(0U, &X86::VK1RegClass);
50657 if (VT == MVT::i8)
50658 return std::make_pair(0U, &X86::VK8RegClass);
50659 if (VT == MVT::i16)
50660 return std::make_pair(0U, &X86::VK16RegClass);
50661 }
50662 if (Subtarget.hasBWI()) {
50663 if (VT == MVT::i32)
50664 return std::make_pair(0U, &X86::VK32RegClass);
50665 if (VT == MVT::i64)
50666 return std::make_pair(0U, &X86::VK64RegClass);
50667 }
50668 break;
50669 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
50670 if (Subtarget.is64Bit()) {
50671 if (VT == MVT::i8 || VT == MVT::i1)
50672 return std::make_pair(0U, &X86::GR8RegClass);
50673 if (VT == MVT::i16)
50674 return std::make_pair(0U, &X86::GR16RegClass);
50675 if (VT == MVT::i32 || VT == MVT::f32)
50676 return std::make_pair(0U, &X86::GR32RegClass);
50677 if (VT != MVT::f80)
50678 return std::make_pair(0U, &X86::GR64RegClass);
50679 break;
50680 }
50681 LLVM_FALLTHROUGH;
50682 // 32-bit fallthrough
50683 case 'Q': // Q_REGS
50684 if (VT == MVT::i8 || VT == MVT::i1)
50685 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
50686 if (VT == MVT::i16)
50687 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
50688 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
50689 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
50690 if (VT != MVT::f80)
50691 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
50692 break;
50693 case 'r': // GENERAL_REGS
50694 case 'l': // INDEX_REGS
50695 if (VT == MVT::i8 || VT == MVT::i1)
50696 return std::make_pair(0U, &X86::GR8RegClass);
50697 if (VT == MVT::i16)
50698 return std::make_pair(0U, &X86::GR16RegClass);
50699 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
50700 return std::make_pair(0U, &X86::GR32RegClass);
50701 if (VT != MVT::f80)
50702 return std::make_pair(0U, &X86::GR64RegClass);
50703 break;
50704 case 'R': // LEGACY_REGS
50705 if (VT == MVT::i8 || VT == MVT::i1)
50706 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
50707 if (VT == MVT::i16)
50708 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
50709 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
50710 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
50711 if (VT != MVT::f80)
50712 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
50713 break;
50714 case 'f': // FP Stack registers.
50715 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
50716 // value to the correct fpstack register class.
50717 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
50718 return std::make_pair(0U, &X86::RFP32RegClass);
50719 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
50720 return std::make_pair(0U, &X86::RFP64RegClass);
50721 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
50722 return std::make_pair(0U, &X86::RFP80RegClass);
50723 break;
50724 case 'y': // MMX_REGS if MMX allowed.
50725 if (!Subtarget.hasMMX()) break;
50726 return std::make_pair(0U, &X86::VR64RegClass);
50727 case 'v':
50728 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
50729 if (!Subtarget.hasSSE1()) break;
50730 bool VConstraint = (Constraint[0] == 'v');
50731
50732 switch (VT.SimpleTy) {
50733 default: break;
50734 // Scalar SSE types.
50735 case MVT::f32:
50736 case MVT::i32:
50737 if (VConstraint && Subtarget.hasVLX())
50738 return std::make_pair(0U, &X86::FR32XRegClass);
50739 return std::make_pair(0U, &X86::FR32RegClass);
50740 case MVT::f64:
50741 case MVT::i64:
50742 if (VConstraint && Subtarget.hasVLX())
50743 return std::make_pair(0U, &X86::FR64XRegClass);
50744 return std::make_pair(0U, &X86::FR64RegClass);
50745 case MVT::i128:
50746 if (Subtarget.is64Bit()) {
50747 if (VConstraint && Subtarget.hasVLX())
50748 return std::make_pair(0U, &X86::VR128XRegClass);
50749 return std::make_pair(0U, &X86::VR128RegClass);
50750 }
50751 break;
50752 // Vector types and fp128.
50753 case MVT::f128:
50754 case MVT::v16i8:
50755 case MVT::v8i16:
50756 case MVT::v4i32:
50757 case MVT::v2i64:
50758 case MVT::v4f32:
50759 case MVT::v2f64:
50760 if (VConstraint && Subtarget.hasVLX())
50761 return std::make_pair(0U, &X86::VR128XRegClass);
50762 return std::make_pair(0U, &X86::VR128RegClass);
50763 // AVX types.
50764 case MVT::v32i8:
50765 case MVT::v16i16:
50766 case MVT::v8i32:
50767 case MVT::v4i64:
50768 case MVT::v8f32:
50769 case MVT::v4f64:
50770 if (VConstraint && Subtarget.hasVLX())
50771 return std::make_pair(0U, &X86::VR256XRegClass);
50772 if (Subtarget.hasAVX())
50773 return std::make_pair(0U, &X86::VR256RegClass);
50774 break;
50775 case MVT::v64i8:
50776 case MVT::v32i16:
50777 case MVT::v8f64:
50778 case MVT::v16f32:
50779 case MVT::v16i32:
50780 case MVT::v8i64:
50781 if (!Subtarget.hasAVX512()) break;
50782 if (VConstraint)
50783 return std::make_pair(0U, &X86::VR512RegClass);
50784 return std::make_pair(0U, &X86::VR512_0_15RegClass);
50785 }
50786 break;
50787 }
50788 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
50789 switch (Constraint[1]) {
50790 default:
50791 break;
50792 case 'i':
50793 case 't':
50794 case '2':
50795 return getRegForInlineAsmConstraint(TRI, "x", VT);
50796 case 'm':
50797 if (!Subtarget.hasMMX()) break;
50798 return std::make_pair(0U, &X86::VR64RegClass);
50799 case 'z':
50800 if (!Subtarget.hasSSE1()) break;
50801 switch (VT.SimpleTy) {
50802 default: break;
50803 // Scalar SSE types.
50804 case MVT::f32:
50805 case MVT::i32:
50806 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
50807 case MVT::f64:
50808 case MVT::i64:
50809 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
50810 case MVT::f128:
50811 case MVT::v16i8:
50812 case MVT::v8i16:
50813 case MVT::v4i32:
50814 case MVT::v2i64:
50815 case MVT::v4f32:
50816 case MVT::v2f64:
50817 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
50818 // AVX types.
50819 case MVT::v32i8:
50820 case MVT::v16i16:
50821 case MVT::v8i32:
50822 case MVT::v4i64:
50823 case MVT::v8f32:
50824 case MVT::v4f64:
50825 if (Subtarget.hasAVX())
50826 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
50827 break;
50828 case MVT::v64i8:
50829 case MVT::v32i16:
50830 case MVT::v8f64:
50831 case MVT::v16f32:
50832 case MVT::v16i32:
50833 case MVT::v8i64:
50834 if (Subtarget.hasAVX512())
50835 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
50836 break;
50837 }
50838 break;
50839 case 'k':
50840 // This register class doesn't allocate k0 for masked vector operations.
50841 if (Subtarget.hasAVX512()) {
50842 if (VT == MVT::i1)
50843 return std::make_pair(0U, &X86::VK1WMRegClass);
50844 if (VT == MVT::i8)
50845 return std::make_pair(0U, &X86::VK8WMRegClass);
50846 if (VT == MVT::i16)
50847 return std::make_pair(0U, &X86::VK16WMRegClass);
50848 }
50849 if (Subtarget.hasBWI()) {
50850 if (VT == MVT::i32)
50851 return std::make_pair(0U, &X86::VK32WMRegClass);
50852 if (VT == MVT::i64)
50853 return std::make_pair(0U, &X86::VK64WMRegClass);
50854 }
50855 break;
50856 }
50857 }
50858
50859 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
50860 return std::make_pair(0U, &X86::GR32RegClass);
50861
50862 // Use the default implementation in TargetLowering to convert the register
50863 // constraint into a member of a register class.
50864 std::pair<Register, const TargetRegisterClass*> Res;
50865 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
50866
50867 // Not found as a standard register?
50868 if (!Res.second) {
50869 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
50870 // to/from f80.
50871 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
50872 // Map st(0) -> st(7) -> ST0
50873 if (Constraint.size() == 7 && Constraint[0] == '{' &&
50874 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
50875 Constraint[3] == '(' &&
50876 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
50877 Constraint[5] == ')' && Constraint[6] == '}') {
50878 // st(7) is not allocatable and thus not a member of RFP80. Return
50879 // singleton class in cases where we have a reference to it.
50880 if (Constraint[4] == '7')
50881 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
50882 return std::make_pair(X86::FP0 + Constraint[4] - '0',
50883 &X86::RFP80RegClass);
50884 }
50885
50886 // GCC allows "st(0)" to be called just plain "st".
50887 if (StringRef("{st}").equals_lower(Constraint))
50888 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
50889 }
50890
50891 // flags -> EFLAGS
50892 if (StringRef("{flags}").equals_lower(Constraint))
50893 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
50894
50895 // dirflag -> DF
50896 // Only allow for clobber.
50897 if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
50898 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
50899
50900 // fpsr -> FPSW
50901 if (StringRef("{fpsr}").equals_lower(Constraint))
50902 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
50903
50904 return Res;
50905 }
50906
50907 // Make sure it isn't a register that requires 64-bit mode.
50908 if (!Subtarget.is64Bit() &&
50909 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
50910 TRI->getEncodingValue(Res.first) >= 8) {
50911 // Register requires REX prefix, but we're in 32-bit mode.
50912 return std::make_pair(0, nullptr);
50913 }
50914
50915 // Make sure it isn't a register that requires AVX512.
50916 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
50917 TRI->getEncodingValue(Res.first) & 0x10) {
50918 // Register requires EVEX prefix.
50919 return std::make_pair(0, nullptr);
50920 }
50921
50922 // Otherwise, check to see if this is a register class of the wrong value
50923 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
50924 // turn into {ax},{dx}.
50925 // MVT::Other is used to specify clobber names.
50926 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
50927 return Res; // Correct type already, nothing to do.
50928
50929 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
50930 // return "eax". This should even work for things like getting 64-bit integer
50931 // registers when given an f64 type.
50932 const TargetRegisterClass *Class = Res.second;
50933 // The generic code will match the first register class that contains the
50934 // given register. Thus, based on the ordering of the tablegened file,
50935 // the "plain" GR classes might not come first.
50936 // Therefore, use a helper method.
50937 if (isGRClass(*Class)) {
50938 unsigned Size = VT.getSizeInBits();
50939 if (Size == 1) Size = 8;
50940 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
50941 if (DestReg > 0) {
50942 bool is64Bit = Subtarget.is64Bit();
50943 const TargetRegisterClass *RC =
50944 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
50945 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
50946 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
50947 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
50948 : nullptr;
50949 if (Size == 64 && !is64Bit) {
50950 // Model GCC's behavior here and select a fixed pair of 32-bit
50951 // registers.
50952 switch (DestReg) {
50953 case X86::RAX:
50954 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
50955 case X86::RDX:
50956 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
50957 case X86::RCX:
50958 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
50959 case X86::RBX:
50960 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
50961 case X86::RSI:
50962 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
50963 case X86::RDI:
50964 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
50965 case X86::RBP:
50966 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
50967 default:
50968 return std::make_pair(0, nullptr);
50969 }
50970 }
50971 if (RC && RC->contains(DestReg))
50972 return std::make_pair(DestReg, RC);
50973 return Res;
50974 }
50975 // No register found/type mismatch.
50976 return std::make_pair(0, nullptr);
50977 } else if (isFRClass(*Class)) {
50978 // Handle references to XMM physical registers that got mapped into the
50979 // wrong class. This can happen with constraints like {xmm0} where the
50980 // target independent register mapper will just pick the first match it can
50981 // find, ignoring the required type.
50982
50983 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
50984 if (VT == MVT::f32 || VT == MVT::i32)
50985 Res.second = &X86::FR32XRegClass;
50986 else if (VT == MVT::f64 || VT == MVT::i64)
50987 Res.second = &X86::FR64XRegClass;
50988 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
50989 Res.second = &X86::VR128XRegClass;
50990 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
50991 Res.second = &X86::VR256XRegClass;
50992 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
50993 Res.second = &X86::VR512RegClass;
50994 else {
50995 // Type mismatch and not a clobber: return an error.
50996 Res.first = 0;
50997 Res.second = nullptr;
50998 }
50999 } else if (isVKClass(*Class)) {
51000 if (VT == MVT::i1)
51001 Res.second = &X86::VK1RegClass;
51002 else if (VT == MVT::i8)
51003 Res.second = &X86::VK8RegClass;
51004 else if (VT == MVT::i16)
51005 Res.second = &X86::VK16RegClass;
51006 else if (VT == MVT::i32)
51007 Res.second = &X86::VK32RegClass;
51008 else if (VT == MVT::i64)
51009 Res.second = &X86::VK64RegClass;
51010 else {
51011 // Type mismatch and not a clobber: return an error.
51012 Res.first = 0;
51013 Res.second = nullptr;
51014 }
51015 }
51016
51017 return Res;
51018}
51019
51020int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
51021 const AddrMode &AM, Type *Ty,
51022 unsigned AS) const {
51023 // Scaling factors are not free at all.
51024 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
51025 // will take 2 allocations in the out of order engine instead of 1
51026 // for plain addressing mode, i.e. inst (reg1).
51027 // E.g.,
51028 // vaddps (%rsi,%rdx), %ymm0, %ymm1
51029 // Requires two allocations (one for the load, one for the computation)
51030 // whereas:
51031 // vaddps (%rsi), %ymm0, %ymm1
51032 // Requires just 1 allocation, i.e., freeing allocations for other operations
51033 // and having less micro operations to execute.
51034 //
51035 // For some X86 architectures, this is even worse because for instance for
51036 // stores, the complex addressing mode forces the instruction to use the
51037 // "load" ports instead of the dedicated "store" port.
51038 // E.g., on Haswell:
51039 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
51040 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
51041 if (isLegalAddressingMode(DL, AM, Ty, AS))
51042 // Scale represents reg2 * scale, thus account for 1
51043 // as soon as we use a second register.
51044 return AM.Scale != 0;
51045 return -1;
51046}
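
The return values reduce to a small rule. A minimal sketch in plain C++ (no LLVM types; Legal stands in for the isLegalAddressingMode result):

struct AddrModeSketch { long Scale; };

static int scalingFactorCostSketch(bool Legal, const AddrModeSketch &AM) {
  if (Legal)
    return AM.Scale != 0; // 0 for (reg), 1 once a scaled index register is used
  return -1;              // illegal addressing mode
}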
51047
51048bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
51049 // Integer division on x86 is expensive. However, when aggressively optimizing
51050 // for code size, we prefer to use a div instruction, as it is usually smaller
51051 // than the alternative sequence.
51052 // The exception to this is vector division. Since x86 doesn't have vector
51053 // integer division, leaving the division as-is is a loss even in terms of
51054 // size, because it will have to be scalarized, while the alternative code
51055 // sequence can be performed in vector form.
51056 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
51057 return OptSize && !VT.isVector();
51058}
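
A hedged illustration of the effect (clang's minsize attribute assumed; DivBySeven is a made-up helper): with only MinSize set, a scalar division like this is typically left as a single div, while a vector division is still expanded because it would have to be scalarized anyway.

__attribute__((minsize)) unsigned DivBySeven(unsigned X) {
  return X / 7; // kept as 'div' for size rather than expanded to mul/shift
}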
51059
51060void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
51061 if (!Subtarget.is64Bit())
51062 return;
51063
51064 // Update IsSplitCSR in X86MachineFunctionInfo.
51065 X86MachineFunctionInfo *AFI =
51066 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
51067 AFI->setIsSplitCSR(true);
51068}
51069
51070void X86TargetLowering::insertCopiesSplitCSR(
51071 MachineBasicBlock *Entry,
51072 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
51073 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
51074 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
51075 if (!IStart)
51076 return;
51077
51078 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
51079 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
51080 MachineBasicBlock::iterator MBBI = Entry->begin();
51081 for (const MCPhysReg *I = IStart; *I; ++I) {
51082 const TargetRegisterClass *RC = nullptr;
51083 if (X86::GR64RegClass.contains(*I))
51084 RC = &X86::GR64RegClass;
51085 else
51086 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
51087
51088 Register NewVR = MRI->createVirtualRegister(RC);
51089 // Create copy from CSR to a virtual register.
51090 // FIXME: this currently does not emit CFI pseudo-instructions, it works
51091 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
51092 // nounwind. If we want to generalize this later, we may need to emit
51093 // CFI pseudo-instructions.
51094 assert(
51095 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
51096 "Function should be nounwind in insertCopiesSplitCSR!");
51097 Entry->addLiveIn(*I);
51098 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
51099 .addReg(*I);
51100
51101 // Insert the copy-back instructions right before the terminator.
51102 for (auto *Exit : Exits)
51103 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
51104 TII->get(TargetOpcode::COPY), *I)
51105 .addReg(NewVR);
51106 }
51107}
51108
51109bool X86TargetLowering::supportSwiftError() const {
51110 return Subtarget.is64Bit();
51111}
51112
51113/// Returns true if stack probing through a function call is requested.
51114bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
51115 return !getStackProbeSymbolName(MF).empty();
51116}
51117
51118/// Returns true if stack probing through inline assembly is requested.
51119bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
51120
51121 // No inline stack probe for Windows, they have their own mechanism.
51122 if (Subtarget.isOSWindows() ||
51123 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
51124 return false;
51125
51126 // If the function specifically requests inline stack probes, emit them.
51127 if (MF.getFunction().hasFnAttribute("probe-stack"))
51128 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
51129 "inline-asm";
51130
51131 return false;
51132}
51133
51134/// Returns the name of the symbol used to emit stack probes or the empty
51135/// string if not applicable.
51136StringRef
51137X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
51138 // Inline Stack probes disable stack probe call
51139 if (hasInlineStackProbe(MF))
51140 return "";
51141
51142 // If the function specifically requests stack probes, emit them.
51143 if (MF.getFunction().hasFnAttribute("probe-stack"))
51144 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
51145
51146 // Generally, if we aren't on Windows, the platform ABI does not include
51147 // support for stack probes, so don't emit them.
51148 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
51149 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
51150 return "";
51151
51152 // We need a stack probe to conform to the Windows ABI. Choose the right
51153 // symbol.
51154 if (Subtarget.is64Bit())
51155 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
51156 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
51157}
51158
51159unsigned
51160X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
51161  // The default stack probe size is 4096 if the function has no
51162  // "stack-probe-size" attribute.
51163 unsigned StackProbeSize = 4096;
51164 const Function &Fn = MF.getFunction();
51165 if (Fn.hasFnAttribute("stack-probe-size"))
51166 Fn.getFnAttribute("stack-probe-size")
51167 .getValueAsString()
51168 .getAsInteger(0, StackProbeSize);
51169 return StackProbeSize;
51170}
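
Illustrative sketch (not part of the listed source): how a frame-lowering helper might combine the three probe hooks above. The helper name describeStackProbes and its placement under llvm/lib/Target/X86 are assumptions.

#include "X86ISelLowering.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Hypothetical helper: decide between inline probes, a probe call, or nothing.
static void describeStackProbes(const X86TargetLowering &TLI,
                                MachineFunction &MF) {
  unsigned ProbeSize = TLI.getStackProbeSize(MF);    // 4096 unless overridden
  if (TLI.hasInlineStackProbe(MF)) {
    // Probes would be expanded inline, stepping the stack by ProbeSize bytes.
  } else if (TLI.hasStackProbeSymbol(MF)) {
    StringRef Sym = TLI.getStackProbeSymbolName(MF); // e.g. "__chkstk" on Win64
    (void)Sym;                                       // a call to Sym would be emitted
  }
  (void)ProbeSize;
}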

/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/CodeGen/ValueTypes.h

1//===- CodeGen/ValueTypes.h - Low-Level Target independ. types --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of low-level target independent types which various
10// values in the code generator are. This allows the target specific behavior
11// of instructions to be described to target independent passes.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CODEGEN_VALUETYPES_H
16#define LLVM_CODEGEN_VALUETYPES_H
17
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MachineValueType.h"
20#include "llvm/Support/MathExtras.h"
21#include "llvm/Support/TypeSize.h"
22#include "llvm/Support/WithColor.h"
23#include <cassert>
24#include <cstdint>
25#include <string>
26
27namespace llvm {
28
29 class LLVMContext;
30 class Type;
31
32 /// Extended Value Type. Capable of holding value types which are not native
33 /// for any processor (such as the i12345 type), as well as the types an MVT
34 /// can represent.
35 struct EVT {
36 private:
37 MVT V = MVT::INVALID_SIMPLE_VALUE_TYPE;
38 Type *LLVMTy = nullptr;
39
40 public:
41 constexpr EVT() = default;
42 constexpr EVT(MVT::SimpleValueType SVT) : V(SVT) {}
43 constexpr EVT(MVT S) : V(S) {}
44
45 bool operator==(EVT VT) const {
46 return !(*this != VT);
47 }
48 bool operator!=(EVT VT) const {
49 if (V.SimpleTy != VT.V.SimpleTy)
50 return true;
51 if (V.SimpleTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
52 return LLVMTy != VT.LLVMTy;
53 return false;
54 }
55
56 /// Returns the EVT that represents a floating-point type with the given
57 /// number of bits. There are two floating-point types with 128 bits - this
58 /// returns f128 rather than ppcf128.
59 static EVT getFloatingPointVT(unsigned BitWidth) {
60 return MVT::getFloatingPointVT(BitWidth);
61 }
62
63 /// Returns the EVT that represents an integer with the given number of
64 /// bits.
65 static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth) {
66 MVT M = MVT::getIntegerVT(BitWidth);
67 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
68 return M;
69 return getExtendedIntegerVT(Context, BitWidth);
70 }
71
72 /// Returns the EVT that represents a vector NumElements in length, where
73 /// each element is of type VT.
74 static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
75 bool IsScalable = false) {
76 MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable);
77 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
78 return M;
79 return getExtendedVectorVT(Context, VT, NumElements, IsScalable);
80 }
81
82 /// Returns the EVT that represents a vector EC.Min elements in length,
83 /// where each element is of type VT.
84 static EVT getVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
85 MVT M = MVT::getVectorVT(VT.V, EC);
86 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
87 return M;
88 return getExtendedVectorVT(Context, VT, EC);
89 }
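
Minimal usage sketch (an assumption, not from the header) of the fall-back behaviour of the factory functions above: widths with a matching MVT stay simple, anything else becomes an extended EVT.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

void evtFactoryExample(LLVMContext &Ctx) {       // hypothetical function name
  EVT I32    = EVT::getIntegerVT(Ctx, 32);       // matches MVT::i32   -> simple
  EVT I12345 = EVT::getIntegerVT(Ctx, 12345);    // no matching MVT    -> extended
  EVT V4I32  = EVT::getVectorVT(Ctx, I32, 4);    // matches MVT::v4i32 -> simple
  assert(I32.isSimple() && I12345.isExtended() && V4I32.isSimple());
  (void)I32; (void)I12345; (void)V4I32;
}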
90
91 /// Return a vector with the same number of elements as this vector, but
92 /// with the element type converted to an integer type with the same
93 /// bitwidth.
94 EVT changeVectorElementTypeToInteger() const {
95 if (isSimple())
96 return getSimpleVT().changeVectorElementTypeToInteger();
97 return changeExtendedVectorElementTypeToInteger();
98 }
99
100 /// Return a VT for a vector type whose attributes match ourselves
101 /// with the exception of the element type that is chosen by the caller.
102 EVT changeVectorElementType(EVT EltVT) const {
103 if (isSimple() && EltVT.isSimple())
104 return getSimpleVT().changeVectorElementType(EltVT.getSimpleVT());
105 return changeExtendedVectorElementType(EltVT);
106 }
107
108 /// Return the type converted to an equivalently sized integer or vector
109 /// with integer element type. Similar to changeVectorElementTypeToInteger,
110 /// but also handles scalars.
111 EVT changeTypeToInteger() {
112 if (isVector())
113 return changeVectorElementTypeToInteger();
114
115 if (isSimple())
116 return getSimpleVT().changeTypeToInteger();
117 return changeExtendedTypeToInteger();
118 }
119
120 /// Test if the given EVT is simple (as opposed to being extended).
121 bool isSimple() const {
122      return V.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE;
18.1
Field 'SimpleTy' is not equal to INVALID_SIMPLE_VALUE_TYPE
25.1
Field 'SimpleTy' is not equal to INVALID_SIMPLE_VALUE_TYPE
19
Returning the value 1, which participates in a condition later
26
Returning the value 1, which participates in a condition later
123 }
124
125 /// Test if the given EVT is extended (as opposed to being simple).
126 bool isExtended() const {
127 return !isSimple();
128 }
129
130 /// Return true if this is a FP or a vector FP type.
131 bool isFloatingPoint() const {
132 return isSimple() ? V.isFloatingPoint() : isExtendedFloatingPoint();
133 }
134
135 /// Return true if this is an integer or a vector integer type.
136 bool isInteger() const {
137 return isSimple() ? V.isInteger() : isExtendedInteger();
138 }
139
140 /// Return true if this is an integer, but not a vector.
141 bool isScalarInteger() const {
142 return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
143 }
144
145 /// Return true if this is a vector value type.
146 bool isVector() const {
147 return isSimple() ? V.isVector() : isExtendedVector();
10
'?' condition is true
11
Calling 'MVT::isVector'
15
Returning from 'MVT::isVector'
16
Returning the value 1, which participates in a condition later
29
'?' condition is true
30
Calling 'MVT::isVector'
32
Returning from 'MVT::isVector'
33
Returning the value 1, which participates in a condition later
148 }
149
150 /// Return true if this is a vector type where the runtime
151 /// length is machine dependent
152 bool isScalableVector() const {
153 return isSimple() ? V.isScalableVector() : isExtendedScalableVector();
154 }
155
156 bool isFixedLengthVector() const {
157 return isSimple() ? V.isFixedLengthVector()
158 : isExtendedFixedLengthVector();
159 }
160
161 /// Return true if this is a 16-bit vector type.
162 bool is16BitVector() const {
163 return isSimple() ? V.is16BitVector() : isExtended16BitVector();
164 }
165
166 /// Return true if this is a 32-bit vector type.
167 bool is32BitVector() const {
168 return isSimple() ? V.is32BitVector() : isExtended32BitVector();
169 }
170
171 /// Return true if this is a 64-bit vector type.
172 bool is64BitVector() const {
173 return isSimple() ? V.is64BitVector() : isExtended64BitVector();
174 }
175
176 /// Return true if this is a 128-bit vector type.
177 bool is128BitVector() const {
178 return isSimple() ? V.is128BitVector() : isExtended128BitVector();
179 }
180
181 /// Return true if this is a 256-bit vector type.
182 bool is256BitVector() const {
183 return isSimple() ? V.is256BitVector() : isExtended256BitVector();
184 }
185
186 /// Return true if this is a 512-bit vector type.
187 bool is512BitVector() const {
188 return isSimple() ? V.is512BitVector() : isExtended512BitVector();
189 }
190
191 /// Return true if this is a 1024-bit vector type.
192 bool is1024BitVector() const {
193 return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
194 }
195
196 /// Return true if this is a 2048-bit vector type.
197 bool is2048BitVector() const {
198 return isSimple() ? V.is2048BitVector() : isExtended2048BitVector();
199 }
200
201 /// Return true if this is an overloaded type for TableGen.
202 bool isOverloaded() const {
203 return (V==MVT::iAny || V==MVT::fAny || V==MVT::vAny || V==MVT::iPTRAny);
204 }
205
206 /// Return true if the bit size is a multiple of 8.
207 bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); }
208
209 /// Return true if the size is a power-of-two number of bytes.
210 bool isRound() const {
211 if (isScalableVector())
212 return false;
213 unsigned BitSize = getSizeInBits();
214 return BitSize >= 8 && !(BitSize & (BitSize - 1));
215 }
216
217 /// Return true if this has the same number of bits as VT.
218 bool bitsEq(EVT VT) const {
219 if (EVT::operator==(VT)) return true;
220 return getSizeInBits() == VT.getSizeInBits();
221 }
222
223 /// Return true if we know at compile time this has more bits than VT.
224 bool knownBitsGT(EVT VT) const {
225 return TypeSize::isKnownGT(getSizeInBits(), VT.getSizeInBits());
226 }
227
228 /// Return true if we know at compile time this has more than or the same
229 /// bits as VT.
230 bool knownBitsGE(EVT VT) const {
231 return TypeSize::isKnownGE(getSizeInBits(), VT.getSizeInBits());
232 }
233
234 /// Return true if we know at compile time this has fewer bits than VT.
235 bool knownBitsLT(EVT VT) const {
236 return TypeSize::isKnownLT(getSizeInBits(), VT.getSizeInBits());
237 }
238
239 /// Return true if we know at compile time this has fewer than or the same
240 /// bits as VT.
241 bool knownBitsLE(EVT VT) const {
242 return TypeSize::isKnownLE(getSizeInBits(), VT.getSizeInBits());
243 }
244
245 /// Return true if this has more bits than VT.
246 bool bitsGT(EVT VT) const {
247 if (EVT::operator==(VT)) return false;
248      assert(isScalableVector() == VT.isScalableVector() &&
249             "Comparison between scalable and fixed types");
250 return knownBitsGT(VT);
251 }
252
253 /// Return true if this has no less bits than VT.
254 bool bitsGE(EVT VT) const {
255 if (EVT::operator==(VT)) return true;
256      assert(isScalableVector() == VT.isScalableVector() &&
257             "Comparison between scalable and fixed types");
258 return knownBitsGE(VT);
259 }
260
261 /// Return true if this has less bits than VT.
262 bool bitsLT(EVT VT) const {
263 if (EVT::operator==(VT)) return false;
264      assert(isScalableVector() == VT.isScalableVector() &&
265             "Comparison between scalable and fixed types");
266 return knownBitsLT(VT);
267 }
268
269 /// Return true if this has no more bits than VT.
270 bool bitsLE(EVT VT) const {
271 if (EVT::operator==(VT)) return true;
272      assert(isScalableVector() == VT.isScalableVector() &&
273             "Comparison between scalable and fixed types");
274 return knownBitsLE(VT);
275 }
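
A small sketch (assumed, not from the header) of the bitsGT/bitsLE family on fixed-width types; bitsEq compares total width, so differently-shaped 128-bit vectors compare equal.

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

void evtBitsCompareExample() {                    // hypothetical function name
  EVT I32 = MVT::i32, I64 = MVT::i64;
  bool GT = I64.bitsGT(I32);                      // true: 64 > 32 bits
  bool LE = I32.bitsLE(I64);                      // true: 32 <= 64 bits
  bool Eq = EVT(MVT::v4i32).bitsEq(MVT::v2i64);   // true: both are 128 bits wide
  (void)GT; (void)LE; (void)Eq;
}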
276
277 /// Return the SimpleValueType held in the specified simple EVT.
278 MVT getSimpleVT() const {
279      assert(isSimple() && "Expected a SimpleValueType!");
280 return V;
281 }
282
283 /// If this is a vector type, return the element type, otherwise return
284 /// this.
285 EVT getScalarType() const {
286 return isVector() ? getVectorElementType() : *this;
287 }
288
289 /// Given a vector type, return the type of each element.
290 EVT getVectorElementType() const {
291      assert(isVector() && "Invalid vector type!");
292 if (isSimple())
293 return V.getVectorElementType();
294 return getExtendedVectorElementType();
295 }
296
297 /// Given a vector type, return the number of elements it contains.
298 unsigned getVectorNumElements() const {
299#ifdef STRICT_FIXED_SIZE_VECTORS
300      assert(isFixedLengthVector() && "Invalid vector type!");
301#else
302      assert(isVector() && "Invalid vector type!");
303 if (isScalableVector())
304 WithColor::warning()
305 << "Possible incorrect use of EVT::getVectorNumElements() for "
306             "scalable vector. Scalable flag may be dropped, use "
307 "EVT::getVectorElementCount() instead\n";
308#endif
309 if (isSimple())
310 return V.getVectorNumElements();
311 return getExtendedVectorNumElements();
312 }
313
314 // Given a (possibly scalable) vector type, return the ElementCount
315 ElementCount getVectorElementCount() const {
316      assert((isVector()) && "Invalid vector type!");
317 if (isSimple())
318 return V.getVectorElementCount();
319
320 return getExtendedVectorElementCount();
321 }
322
323 /// Given a vector type, return the minimum number of elements it contains.
324 unsigned getVectorMinNumElements() const {
325 return getVectorElementCount().getKnownMinValue();
326 }
327
328 /// Return the size of the specified value type in bits.
329 ///
330 /// If the value type is a scalable vector type, the scalable property will
331 /// be set and the runtime size will be a positive integer multiple of the
332 /// base size.
333 TypeSize getSizeInBits() const {
334 if (isSimple())
335 return V.getSizeInBits();
336 return getExtendedSizeInBits();
337 }
338
339 /// Return the size of the specified fixed width value type in bits. The
340 /// function will assert if the type is scalable.
341 uint64_t getFixedSizeInBits() const {
342 return getSizeInBits().getFixedSize();
343 }
344
345 uint64_t getScalarSizeInBits() const {
346 return getScalarType().getSizeInBits().getFixedSize();
347 }
348
349 /// Return the number of bytes overwritten by a store of the specified value
350 /// type.
351 ///
352 /// If the value type is a scalable vector type, the scalable property will
353 /// be set and the runtime size will be a positive integer multiple of the
354 /// base size.
355 TypeSize getStoreSize() const {
356 TypeSize BaseSize = getSizeInBits();
357 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
358 }
359
360 /// Return the number of bits overwritten by a store of the specified value
361 /// type.
362 ///
363 /// If the value type is a scalable vector type, the scalable property will
364 /// be set and the runtime size will be a positive integer multiple of the
365 /// base size.
366 TypeSize getStoreSizeInBits() const {
367 return getStoreSize() * 8;
368 }
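
A short sketch (assumption) of the byte round-up performed by getStoreSize(): (BaseSize + 7) / 8, so an i1 still occupies one byte and an f80 ten.

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

void evtStoreSizeExample() {                                             // hypothetical function name
  uint64_t I1Bytes  = EVT(MVT::i1).getStoreSize().getFixedSize();        // 1
  uint64_t F80Bytes = EVT(MVT::f80).getStoreSize().getFixedSize();       // 10
  uint64_t F80Bits  = EVT(MVT::f80).getStoreSizeInBits().getFixedSize(); // 80
  (void)I1Bytes; (void)F80Bytes; (void)F80Bits;
}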
369
370 /// Rounds the bit-width of the given integer EVT up to the nearest power of
371 /// two (and at least to eight), and returns the integer EVT with that
372 /// number of bits.
373 EVT getRoundIntegerType(LLVMContext &Context) const {
374      assert(isInteger() && !isVector() && "Invalid integer type!");
375 unsigned BitWidth = getSizeInBits();
376 if (BitWidth <= 8)
377 return EVT(MVT::i8);
378 return getIntegerVT(Context, 1 << Log2_32_Ceil(BitWidth));
379 }
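
Worked sketch (assumption): widths of at most eight bits round to i8, anything larger to the next power of two, so i3 -> i8 and i17 -> i32 (1 << Log2_32_Ceil(17) == 32).

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void evtRoundExample(LLVMContext &Ctx) {                          // hypothetical function name
  EVT R3  = EVT::getIntegerVT(Ctx, 3).getRoundIntegerType(Ctx);   // i8
  EVT R17 = EVT::getIntegerVT(Ctx, 17).getRoundIntegerType(Ctx);  // i32
  (void)R3; (void)R17;
}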
380
381 /// Finds the smallest simple value type that is greater than or equal to
382 /// half the width of this EVT. If no simple value type can be found, an
383 /// extended integer value type of half the size (rounded up) is returned.
384 EVT getHalfSizedIntegerVT(LLVMContext &Context) const {
385      assert(isInteger() && !isVector() && "Invalid integer type!");
386 unsigned EVTSize = getSizeInBits();
387 for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
388 IntVT <= MVT::LAST_INTEGER_VALUETYPE; ++IntVT) {
389 EVT HalfVT = EVT((MVT::SimpleValueType)IntVT);
390 if (HalfVT.getSizeInBits() * 2 >= EVTSize)
391 return HalfVT;
392 }
393 return getIntegerVT(Context, (EVTSize + 1) / 2);
394 }
395
396 /// Return a VT for an integer vector type with the size of the
397    /// elements doubled. The type returned may be an extended type.
398 EVT widenIntegerVectorElementType(LLVMContext &Context) const {
399 EVT EltVT = getVectorElementType();
400 EltVT = EVT::getIntegerVT(Context, 2 * EltVT.getSizeInBits());
401 return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
402 }
403
404 // Return a VT for a vector type with the same element type but
405 // half the number of elements. The type returned may be an
406 // extended type.
407 EVT getHalfNumVectorElementsVT(LLVMContext &Context) const {
408 EVT EltVT = getVectorElementType();
409 auto EltCnt = getVectorElementCount();
410      assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!");
411 return EVT::getVectorVT(Context, EltVT, EltCnt.divideCoefficientBy(2));
412 }
413
414 // Return a VT for a vector type with the same element type but
415 // double the number of elements. The type returned may be an
416 // extended type.
417 EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const {
418 EVT EltVT = getVectorElementType();
419 auto EltCnt = getVectorElementCount();
420 return EVT::getVectorVT(Context, EltVT, EltCnt * 2);
421 }
422
423 /// Returns true if the given vector is a power of 2.
424 bool isPow2VectorType() const {
425 unsigned NElts = getVectorMinNumElements();
426 return !(NElts & (NElts - 1));
427 }
428
429 /// Widens the length of the given vector EVT up to the nearest power of 2
430 /// and returns that type.
431 EVT getPow2VectorType(LLVMContext &Context) const {
432 if (!isPow2VectorType()) {
433 ElementCount NElts = getVectorElementCount();
434 unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue());
435 NElts = ElementCount::get(NewMinCount, NElts.isScalable());
436 return EVT::getVectorVT(Context, getVectorElementType(), NElts);
437 }
438 else {
439 return *this;
440 }
441 }
442
443 /// This function returns value type as a string, e.g. "i32".
444 std::string getEVTString() const;
445
446 /// This method returns an LLVM type corresponding to the specified EVT.
447 /// For integer types, this returns an unsigned type. Note that this will
448 /// abort for types that cannot be represented.
449 Type *getTypeForEVT(LLVMContext &Context) const;
450
451 /// Return the value type corresponding to the specified type.
452 /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
453 /// types are returned as Other, otherwise they are invalid.
454 static EVT getEVT(Type *Ty, bool HandleUnknown = false);
455
456 intptr_t getRawBits() const {
457 if (isSimple())
458 return V.SimpleTy;
459 else
460 return (intptr_t)(LLVMTy);
461 }
462
463 /// A meaningless but well-behaved order, useful for constructing
464 /// containers.
465 struct compareRawBits {
466 bool operator()(EVT L, EVT R) const {
467 if (L.V.SimpleTy == R.V.SimpleTy)
468 return L.LLVMTy < R.LLVMTy;
469 else
470 return L.V.SimpleTy < R.V.SimpleTy;
471 }
472 };
473
474 private:
475 // Methods for handling the Extended-type case in functions above.
476 // These are all out-of-line to prevent users of this header file
477 // from having a dependency on Type.h.
478 EVT changeExtendedTypeToInteger() const;
479 EVT changeExtendedVectorElementType(EVT EltVT) const;
480 EVT changeExtendedVectorElementTypeToInteger() const;
481 static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth);
482 static EVT getExtendedVectorVT(LLVMContext &C, EVT VT, unsigned NumElements,
483 bool IsScalable);
484 static EVT getExtendedVectorVT(LLVMContext &Context, EVT VT,
485 ElementCount EC);
486    bool isExtendedFloatingPoint() const LLVM_READONLY;
487    bool isExtendedInteger() const LLVM_READONLY;
488    bool isExtendedScalarInteger() const LLVM_READONLY;
489    bool isExtendedVector() const LLVM_READONLY;
490    bool isExtended16BitVector() const LLVM_READONLY;
491    bool isExtended32BitVector() const LLVM_READONLY;
492    bool isExtended64BitVector() const LLVM_READONLY;
493    bool isExtended128BitVector() const LLVM_READONLY;
494    bool isExtended256BitVector() const LLVM_READONLY;
495    bool isExtended512BitVector() const LLVM_READONLY;
496    bool isExtended1024BitVector() const LLVM_READONLY;
497    bool isExtended2048BitVector() const LLVM_READONLY;
498    bool isExtendedFixedLengthVector() const LLVM_READONLY;
499    bool isExtendedScalableVector() const LLVM_READONLY;
500    EVT getExtendedVectorElementType() const;
501    unsigned getExtendedVectorNumElements() const LLVM_READONLY;
502    ElementCount getExtendedVectorElementCount() const LLVM_READONLY;
503    TypeSize getExtendedSizeInBits() const LLVM_READONLY;
504 };
505
506} // end namespace llvm
507
508#endif // LLVM_CODEGEN_VALUETYPES_H

/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/Support/MachineValueType.h

1//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of machine-level target independent types which
10// legal values in the code generator use.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
15#define LLVM_SUPPORT_MACHINEVALUETYPE_H
16
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/TypeSize.h"
21#include <cassert>
22
23namespace llvm {
24
25 class Type;
26
27 /// Machine Value Type. Every type that is supported natively by some
28 /// processor targeted by LLVM occurs here. This means that any legal value
29 /// type can be represented by an MVT.
30 class MVT {
31 public:
32 enum SimpleValueType : uint8_t {
33 // Simple value types that aren't explicitly part of this enumeration
34 // are considered extended value types.
35 INVALID_SIMPLE_VALUE_TYPE = 0,
36
37 // If you change this numbering, you must change the values in
38 // ValueTypes.td as well!
39 Other = 1, // This is a non-standard value
40 i1 = 2, // This is a 1 bit integer value
41 i8 = 3, // This is an 8 bit integer value
42 i16 = 4, // This is a 16 bit integer value
43 i32 = 5, // This is a 32 bit integer value
44 i64 = 6, // This is a 64 bit integer value
45 i128 = 7, // This is a 128 bit integer value
46
47 FIRST_INTEGER_VALUETYPE = i1,
48 LAST_INTEGER_VALUETYPE = i128,
49
50 bf16 = 8, // This is a 16 bit brain floating point value
51 f16 = 9, // This is a 16 bit floating point value
52 f32 = 10, // This is a 32 bit floating point value
53 f64 = 11, // This is a 64 bit floating point value
54 f80 = 12, // This is a 80 bit floating point value
55 f128 = 13, // This is a 128 bit floating point value
56 ppcf128 = 14, // This is a PPC 128-bit floating point value
57
58 FIRST_FP_VALUETYPE = bf16,
59 LAST_FP_VALUETYPE = ppcf128,
60
61 v1i1 = 15, // 1 x i1
62 v2i1 = 16, // 2 x i1
63 v4i1 = 17, // 4 x i1
64 v8i1 = 18, // 8 x i1
65 v16i1 = 19, // 16 x i1
66 v32i1 = 20, // 32 x i1
67 v64i1 = 21, // 64 x i1
68 v128i1 = 22, // 128 x i1
69 v256i1 = 23, // 256 x i1
70 v512i1 = 24, // 512 x i1
71 v1024i1 = 25, // 1024 x i1
72
73 v1i8 = 26, // 1 x i8
74 v2i8 = 27, // 2 x i8
75 v4i8 = 28, // 4 x i8
76 v8i8 = 29, // 8 x i8
77 v16i8 = 30, // 16 x i8
78 v32i8 = 31, // 32 x i8
79 v64i8 = 32, // 64 x i8
80 v128i8 = 33, //128 x i8
81 v256i8 = 34, //256 x i8
82
83 v1i16 = 35, // 1 x i16
84 v2i16 = 36, // 2 x i16
85 v3i16 = 37, // 3 x i16
86 v4i16 = 38, // 4 x i16
87 v8i16 = 39, // 8 x i16
88 v16i16 = 40, // 16 x i16
89 v32i16 = 41, // 32 x i16
90 v64i16 = 42, // 64 x i16
91 v128i16 = 43, //128 x i16
92
93 v1i32 = 44, // 1 x i32
94 v2i32 = 45, // 2 x i32
95 v3i32 = 46, // 3 x i32
96 v4i32 = 47, // 4 x i32
97 v5i32 = 48, // 5 x i32
98 v8i32 = 49, // 8 x i32
99 v16i32 = 50, // 16 x i32
100 v32i32 = 51, // 32 x i32
101 v64i32 = 52, // 64 x i32
102 v128i32 = 53, // 128 x i32
103 v256i32 = 54, // 256 x i32
104 v512i32 = 55, // 512 x i32
105 v1024i32 = 56, // 1024 x i32
106 v2048i32 = 57, // 2048 x i32
107
108 v1i64 = 58, // 1 x i64
109 v2i64 = 59, // 2 x i64
110 v4i64 = 60, // 4 x i64
111 v8i64 = 61, // 8 x i64
112 v16i64 = 62, // 16 x i64
113 v32i64 = 63, // 32 x i64
114 v64i64 = 64, // 64 x i64
115 v128i64 = 65, // 128 x i64
116 v256i64 = 66, // 256 x i64
117
118 v1i128 = 67, // 1 x i128
119
120 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
121 LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
122
123 v2f16 = 68, // 2 x f16
124 v3f16 = 69, // 3 x f16
125 v4f16 = 70, // 4 x f16
126 v8f16 = 71, // 8 x f16
127 v16f16 = 72, // 16 x f16
128 v32f16 = 73, // 32 x f16
129 v64f16 = 74, // 64 x f16
130 v128f16 = 75, // 128 x f16
131 v2bf16 = 76, // 2 x bf16
132 v3bf16 = 77, // 3 x bf16
133 v4bf16 = 78, // 4 x bf16
134 v8bf16 = 79, // 8 x bf16
135 v16bf16 = 80, // 16 x bf16
136 v32bf16 = 81, // 32 x bf16
137 v64bf16 = 82, // 64 x bf16
138 v128bf16 = 83, // 128 x bf16
139 v1f32 = 84, // 1 x f32
140 v2f32 = 85, // 2 x f32
141 v3f32 = 86, // 3 x f32
142 v4f32 = 87, // 4 x f32
143 v5f32 = 88, // 5 x f32
144 v8f32 = 89, // 8 x f32
145 v16f32 = 90, // 16 x f32
146 v32f32 = 91, // 32 x f32
147 v64f32 = 92, // 64 x f32
148 v128f32 = 93, // 128 x f32
149 v256f32 = 94, // 256 x f32
150 v512f32 = 95, // 512 x f32
151 v1024f32 = 96, // 1024 x f32
152 v2048f32 = 97, // 2048 x f32
153 v1f64 = 98, // 1 x f64
154 v2f64 = 99, // 2 x f64
155 v4f64 = 100, // 4 x f64
156 v8f64 = 101, // 8 x f64
157 v16f64 = 102, // 16 x f64
158 v32f64 = 103, // 32 x f64
159 v64f64 = 104, // 64 x f64
160 v128f64 = 105, // 128 x f64
161 v256f64 = 106, // 256 x f64
162
163 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16,
164 LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v256f64,
165
166 FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
167 LAST_FIXEDLEN_VECTOR_VALUETYPE = v256f64,
168
169 nxv1i1 = 107, // n x 1 x i1
170 nxv2i1 = 108, // n x 2 x i1
171 nxv4i1 = 109, // n x 4 x i1
172 nxv8i1 = 110, // n x 8 x i1
173 nxv16i1 = 111, // n x 16 x i1
174 nxv32i1 = 112, // n x 32 x i1
175 nxv64i1 = 113, // n x 64 x i1
176
177 nxv1i8 = 114, // n x 1 x i8
178 nxv2i8 = 115, // n x 2 x i8
179 nxv4i8 = 116, // n x 4 x i8
180 nxv8i8 = 117, // n x 8 x i8
181 nxv16i8 = 118, // n x 16 x i8
182 nxv32i8 = 119, // n x 32 x i8
183 nxv64i8 = 120, // n x 64 x i8
184
185 nxv1i16 = 121, // n x 1 x i16
186 nxv2i16 = 122, // n x 2 x i16
187 nxv4i16 = 123, // n x 4 x i16
188 nxv8i16 = 124, // n x 8 x i16
189 nxv16i16 = 125, // n x 16 x i16
190 nxv32i16 = 126, // n x 32 x i16
191
192 nxv1i32 = 127, // n x 1 x i32
193 nxv2i32 = 128, // n x 2 x i32
194 nxv4i32 = 129, // n x 4 x i32
195 nxv8i32 = 130, // n x 8 x i32
196 nxv16i32 = 131, // n x 16 x i32
197 nxv32i32 = 132, // n x 32 x i32
198
199 nxv1i64 = 133, // n x 1 x i64
200 nxv2i64 = 134, // n x 2 x i64
201 nxv4i64 = 135, // n x 4 x i64
202 nxv8i64 = 136, // n x 8 x i64
203 nxv16i64 = 137, // n x 16 x i64
204 nxv32i64 = 138, // n x 32 x i64
205
206 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
207 LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
208
209 nxv1f16 = 139, // n x 1 x f16
210 nxv2f16 = 140, // n x 2 x f16
211 nxv4f16 = 141, // n x 4 x f16
212 nxv8f16 = 142, // n x 8 x f16
213 nxv16f16 = 143, // n x 16 x f16
214 nxv32f16 = 144, // n x 32 x f16
215 nxv2bf16 = 145, // n x 2 x bf16
216 nxv4bf16 = 146, // n x 4 x bf16
217 nxv8bf16 = 147, // n x 8 x bf16
218 nxv1f32 = 148, // n x 1 x f32
219 nxv2f32 = 149, // n x 2 x f32
220 nxv4f32 = 150, // n x 4 x f32
221 nxv8f32 = 151, // n x 8 x f32
222 nxv16f32 = 152, // n x 16 x f32
223 nxv1f64 = 153, // n x 1 x f64
224 nxv2f64 = 154, // n x 2 x f64
225 nxv4f64 = 155, // n x 4 x f64
226 nxv8f64 = 156, // n x 8 x f64
227
228 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv1f16,
229 LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
230
231 FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
232 LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
233
234 FIRST_VECTOR_VALUETYPE = v1i1,
235 LAST_VECTOR_VALUETYPE = nxv8f64,
236
237 x86mmx = 157, // This is an X86 MMX value
238
239 Glue = 158, // This glues nodes together during pre-RA sched
240
241 isVoid = 159, // This has no value
242
243 Untyped = 160, // This value takes a register, but has
244 // unspecified type. The register class
245 // will be determined by the opcode.
246
247 exnref = 161, // WebAssembly's exnref type
248 funcref = 162, // WebAssembly's funcref type
249 externref = 163, // WebAssembly's externref type
250
251 FIRST_VALUETYPE = 1, // This is always the beginning of the list.
252 LAST_VALUETYPE = 164, // This always remains at the end of the list.
253
254 // This is the current maximum for LAST_VALUETYPE.
255 // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
256 // This value must be a multiple of 32.
257 MAX_ALLOWED_VALUETYPE = 192,
258
259 // A value of type llvm::TokenTy
260 token = 248,
261
262 // This is MDNode or MDString.
263 Metadata = 249,
264
265 // An int value the size of the pointer of the current
266 // target to any address space. This must only be used internal to
267 // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
268 iPTRAny = 250,
269
270 // A vector with any length and element size. This is used
271 // for intrinsics that have overloadings based on vector types.
272 // This is only for tblgen's consumption!
273 vAny = 251,
274
275 // Any floating-point or vector floating-point value. This is used
276 // for intrinsics that have overloadings based on floating-point types.
277 // This is only for tblgen's consumption!
278 fAny = 252,
279
280 // An integer or vector integer value of any bit width. This is
281 // used for intrinsics that have overloadings based on integer bit widths.
282 // This is only for tblgen's consumption!
283 iAny = 253,
284
285 // An int value the size of the pointer of the current
286 // target. This should only be used internal to tblgen!
287 iPTR = 254,
288
289 // Any type. This is used for intrinsics that have overloadings.
290 // This is only for tblgen's consumption!
291 Any = 255
292 };
293
294 SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
295
296 constexpr MVT() = default;
297 constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {}
298
299 bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; }
300 bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; }
301 bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; }
302 bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; }
303 bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
304 bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
305
306 /// Return true if this is a valid simple valuetype.
307 bool isValid() const {
308 return (SimpleTy >= MVT::FIRST_VALUETYPE &&
309 SimpleTy < MVT::LAST_VALUETYPE);
310 }
311
312 /// Return true if this is a FP or a vector FP type.
313 bool isFloatingPoint() const {
314 return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
315 SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
316 (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE &&
317 SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) ||
318 (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE &&
319 SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE));
320 }
321
322 /// Return true if this is an integer or a vector integer type.
323 bool isInteger() const {
324 return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
325 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
326 (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE &&
327 SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) ||
328 (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE &&
329 SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE));
330 }
331
332 /// Return true if this is an integer, not including vectors.
333 bool isScalarInteger() const {
334 return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
335 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
336 }
337
338 /// Return true if this is a vector value type.
339 bool isVector() const {
340      return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
341              SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
12
Assuming field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
13
Assuming field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
14
Returning the value 1, which participates in a condition later
30.1
Field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
30.2
Field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
31
Returning the value 1, which participates in a condition later
342 }
343
344 /// Return true if this is a vector value type where the
345 /// runtime length is machine dependent
346 bool isScalableVector() const {
347 return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE &&
348 SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
349 }
350
351 bool isFixedLengthVector() const {
352 return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
353 SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
354 }
355
356 /// Return true if this is a 16-bit vector type.
357 bool is16BitVector() const {
358 return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
359 SimpleTy == MVT::v16i1);
360 }
361
362 /// Return true if this is a 32-bit vector type.
363 bool is32BitVector() const {
364 return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 ||
365 SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
366 SimpleTy == MVT::v2f16 || SimpleTy == MVT::v2bf16 ||
367 SimpleTy == MVT::v1f32);
368 }
369
370 /// Return true if this is a 64-bit vector type.
371 bool is64BitVector() const {
372 return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 ||
373 SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
374 SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
375 SimpleTy == MVT::v4bf16 ||SimpleTy == MVT::v2f32 ||
376 SimpleTy == MVT::v1f64);
377 }
378
379 /// Return true if this is a 128-bit vector type.
380 bool is128BitVector() const {
381 return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 ||
382 SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 ||
383 SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 ||
384 SimpleTy == MVT::v8f16 || SimpleTy == MVT::v8bf16 ||
385 SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64);
386 }
387
388 /// Return true if this is a 256-bit vector type.
389 bool is256BitVector() const {
390 return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v16bf16 ||
391 SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64 ||
392 SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 ||
393 SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64 ||
394 SimpleTy == MVT::v256i1);
395 }
396
397 /// Return true if this is a 512-bit vector type.
398 bool is512BitVector() const {
399 return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v32bf16 ||
400 SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64 ||
401 SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8 ||
402 SimpleTy == MVT::v32i16 || SimpleTy == MVT::v16i32 ||
403 SimpleTy == MVT::v8i64);
404 }
405
406 /// Return true if this is a 1024-bit vector type.
407 bool is1024BitVector() const {
408 return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
409 SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 ||
410 SimpleTy == MVT::v16i64 || SimpleTy == MVT::v64f16 ||
411 SimpleTy == MVT::v32f32 || SimpleTy == MVT::v16f64 ||
412 SimpleTy == MVT::v64bf16);
413 }
414
415 /// Return true if this is a 2048-bit vector type.
416 bool is2048BitVector() const {
417 return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
418 SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64 ||
419 SimpleTy == MVT::v128f16 || SimpleTy == MVT::v64f32 ||
420 SimpleTy == MVT::v32f64 || SimpleTy == MVT::v128bf16);
421 }
422
423 /// Return true if this is an overloaded type for TableGen.
424 bool isOverloaded() const {
425 return (SimpleTy == MVT::Any || SimpleTy == MVT::iAny ||
426 SimpleTy == MVT::fAny || SimpleTy == MVT::vAny ||
427 SimpleTy == MVT::iPTRAny);
428 }
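
Quick sketch (assumption) of the classification predicates above on a couple of simple value types.

#include "llvm/Support/MachineValueType.h"
using namespace llvm;

void mvtClassifyExample() {                           // hypothetical function name
  MVT V = MVT::v4i32;
  bool IsVec  = V.isVector();                         // true
  bool Is128  = V.is128BitVector();                   // true: 4 x 32 bits
  bool IsScal = V.isScalableVector();                 // false: fixed-length vector
  bool NxScal = MVT(MVT::nxv4i32).isScalableVector(); // true: "n x 4 x i32"
  (void)IsVec; (void)Is128; (void)IsScal; (void)NxScal;
}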
429
430 /// Return a vector with the same number of elements as this vector, but
431 /// with the element type converted to an integer type with the same
432 /// bitwidth.
433 MVT changeVectorElementTypeToInteger() const {
434 MVT EltTy = getVectorElementType();
435 MVT IntTy = MVT::getIntegerVT(EltTy.getSizeInBits());
436 MVT VecTy = MVT::getVectorVT(IntTy, getVectorElementCount());
437      assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
438             "Simple vector VT not representable by simple integer vector VT!");
439 return VecTy;
440 }
441
442 /// Return a VT for a vector type whose attributes match ourselves
443 /// with the exception of the element type that is chosen by the caller.
444 MVT changeVectorElementType(MVT EltVT) const {
445 MVT VecTy = MVT::getVectorVT(EltVT, getVectorElementCount());
446      assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
447             "Simple vector VT not representable by simple integer vector VT!");
448 return VecTy;
449 }
450
451 /// Return the type converted to an equivalently sized integer or vector
452 /// with integer element type. Similar to changeVectorElementTypeToInteger,
453 /// but also handles scalars.
454 MVT changeTypeToInteger() {
455 if (isVector())
456 return changeVectorElementTypeToInteger();
457 return MVT::getIntegerVT(getSizeInBits());
458 }
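
Sketch (assumption) of changeTypeToInteger() on a vector and a scalar MVT: the bit width is preserved, only the element kind changes.

#include "llvm/Support/MachineValueType.h"
using namespace llvm;

void mvtToIntegerExample() {              // hypothetical function name
  MVT VF = MVT::v2f64;
  MVT SF = MVT::f32;
  MVT VI = VF.changeTypeToInteger();      // MVT::v2i64
  MVT SI = SF.changeTypeToInteger();      // MVT::i32
  (void)VI; (void)SI;
}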
459
460 /// Return a VT for a vector type with the same element type but
461 /// half the number of elements.
462 MVT getHalfNumVectorElementsVT() const {
463 MVT EltVT = getVectorElementType();
464 auto EltCnt = getVectorElementCount();
465      assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!");
466 return getVectorVT(EltVT, EltCnt.divideCoefficientBy(2));
467 }
468
469 /// Returns true if the given vector is a power of 2.
470 bool isPow2VectorType() const {
471 unsigned NElts = getVectorNumElements();
472 return !(NElts & (NElts - 1));
473 }
474
475 /// Widens the length of the given vector MVT up to the nearest power of 2
476 /// and returns that type.
477 MVT getPow2VectorType() const {
478 if (isPow2VectorType())
479 return *this;
480
481 unsigned NElts = getVectorNumElements();
482 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
483 return MVT::getVectorVT(getVectorElementType(), Pow2NElts);
484 }
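
Sketch (assumption) of the power-of-two widening above: v3i32 is not a power of two, so it widens to v4i32 (1 << Log2_32_Ceil(3) == 4), while an already-round type comes back unchanged.

#include "llvm/Support/MachineValueType.h"
using namespace llvm;

void mvtPow2Example() {                                  // hypothetical function name
  MVT Widened   = MVT(MVT::v3i32).getPow2VectorType();   // MVT::v4i32
  MVT Unchanged = MVT(MVT::v8i16).getPow2VectorType();   // already a power of two
  (void)Widened; (void)Unchanged;
}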
485
486 /// If this is a vector, return the element type, otherwise return this.
487 MVT getScalarType() const {
488 return isVector() ? getVectorElementType() : *this;
489 }
490
491 MVT getVectorElementType() const {
492 switch (SimpleTy) {
493 default:
494        llvm_unreachable("Not a vector MVT!");
495 case v1i1:
496 case v2i1:
497 case v4i1:
498 case v8i1:
499 case v16i1:
500 case v32i1:
501 case v64i1:
502 case v128i1:
503 case v256i1:
504 case v512i1:
505 case v1024i1:
506 case nxv1i1:
507 case nxv2i1:
508 case nxv4i1:
509 case nxv8i1:
510 case nxv16i1:
511 case nxv32i1:
512 case nxv64i1: return i1;
513 case v1i8:
514 case v2i8:
515 case v4i8:
516 case v8i8:
517 case v16i8:
518 case v32i8:
519 case v64i8:
520 case v128i8:
521 case v256i8:
522 case nxv1i8:
523 case nxv2i8:
524 case nxv4i8:
525 case nxv8i8:
526 case nxv16i8:
527 case nxv32i8:
528 case nxv64i8: return i8;
529 case v1i16:
530 case v2i16:
531 case v3i16:
532 case v4i16:
533 case v8i16:
534 case v16i16:
535 case v32i16:
536 case v64i16:
537 case v128i16:
538 case nxv1i16:
539 case nxv2i16:
540 case nxv4i16:
541 case nxv8i16:
542 case nxv16i16:
543 case nxv32i16: return i16;
544 case v1i32:
545 case v2i32:
546 case v3i32:
547 case v4i32:
548 case v5i32:
549 case v8i32:
550 case v16i32:
551 case v32i32:
552 case v64i32:
553 case v128i32:
554 case v256i32:
555 case v512i32:
556 case v1024i32:
557 case v2048i32:
558 case nxv1i32:
559 case nxv2i32:
560 case nxv4i32:
561 case nxv8i32:
562 case nxv16i32:
563 case nxv32i32: return i32;
564 case v1i64:
565 case v2i64:
566 case v4i64:
567 case v8i64:
568 case v16i64:
569 case v32i64:
570 case v64i64:
571 case v128i64:
572 case v256i64:
573 case nxv1i64:
574 case nxv2i64:
575 case nxv4i64:
576 case nxv8i64:
577 case nxv16i64:
578 case nxv32i64: return i64;
579 case v1i128: return i128;
580 case v2f16:
581 case v3f16:
582 case v4f16:
583 case v8f16:
584 case v16f16:
585 case v32f16:
586 case v64f16:
587 case v128f16:
588 case nxv1f16:
589 case nxv2f16:
590 case nxv4f16:
591 case nxv8f16:
592 case nxv16f16:
593 case nxv32f16: return f16;
594 case v2bf16:
595 case v3bf16:
596 case v4bf16:
597 case v8bf16:
598 case v16bf16:
599 case v32bf16:
600 case v64bf16:
601 case v128bf16:
602 case nxv2bf16:
603 case nxv4bf16:
604 case nxv8bf16: return bf16;
605 case v1f32:
606 case v2f32:
607 case v3f32:
608 case v4f32:
609 case v5f32:
610 case v8f32:
611 case v16f32:
612 case v32f32:
613 case v64f32:
614 case v128f32:
615 case v256f32:
616 case v512f32:
617 case v1024f32:
618 case v2048f32:
619 case nxv1f32:
620 case nxv2f32:
621 case nxv4f32:
622 case nxv8f32:
623 case nxv16f32: return f32;
624 case v1f64:
625 case v2f64:
626 case v4f64:
627 case v8f64:
628 case v16f64:
629 case v32f64:
630 case v64f64:
631 case v128f64:
632 case v256f64:
633 case nxv1f64:
634 case nxv2f64:
635 case nxv4f64:
636 case nxv8f64: return f64;
637 }
638 }
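
Sketch (assumption): the switch above simply strips the (possibly scalable) vector wrapper and returns the scalar element type.

#include "llvm/Support/MachineValueType.h"
using namespace llvm;

void mvtElementTypeExample() {                           // hypothetical function name
  MVT E1 = MVT(MVT::v8f16).getVectorElementType();       // MVT::f16
  MVT E2 = MVT(MVT::nxv2i64).getVectorElementType();     // MVT::i64
  (void)E1; (void)E2;
}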
639
640 unsigned getVectorNumElements() const {
641 switch (SimpleTy) {
642 default:
643        llvm_unreachable("Not a vector MVT!");
644 case v2048i32:
645 case v2048f32: return 2048;
646 case v1024i1:
647 case v1024i32:
648 case v1024f32: return 1024;
649 case v512i1:
650 case v512i32:
651 case v512f32: return 512;
652 case v256i1:
653 case v256i8:
654 case v256i32:
655 case v256i64:
656 case v256f32:
657 case v256f64: return 256;
658 case v128i1:
659 case v128i8:
660 case v128i16:
661 case v128i32:
662 case v128i64:
663 case v128f16:
664 case v128bf16:
665 case v128f32:
666 case v128f64: return 128;
667 case v64i1:
668 case v64i8:
669 case v64i16:
670 case v64i32:
671 case v64i64:
672 case v64f16:
673 case v64bf16:
674 case v64f32:
675 case v64f64:
676 case nxv64i1:
677 case nxv64i8: return 64;
678 case v32i1:
679 case v32i8:
680 case v32i16:
681 case v32i32:
682 case v32i64:
683 case v32f16:
684 case v32bf16:
685 case v32f32:
686 case v32f64:
687 case nxv32i1:
688 case nxv32i8:
689 case nxv32i16:
690 case nxv32i32:
691 case nxv32i64:
692 case nxv32f16: return 32;
693 case v16i1:
694 case v16i8:
695 case v16i16:
696 case v16i32:
697 case v16i64:
698 case v16f16:
699 case v16bf16:
700 case v16f32:
701 case v16f64:
702 case nxv16i1:
703 case nxv16i8:
704 case nxv16i16:
705 case nxv16i32:
706 case nxv16i64:
707 case nxv16f16:
708 case nxv16f32: return 16;
709 case v8i1:
710 case v8i8:
711 case v8i16:
712 case v8i32:
713 case v8i64:
714 case v8f16:
715 case v8bf16:
716 case v8f32:
717 case v8f64:
718 case nxv8i1:
719 case nxv8i8:
720 case nxv8i16:
721 case nxv8i32:
722 case nxv8i64:
723 case nxv8f16:
724 case nxv8bf16:
725 case nxv8f32:
726 case nxv8f64: return 8;
727 case v5i32:
728 case v5f32: return 5;
729 case v4i1:
730 case v4i8:
731 case v4i16:
732 case v4i32:
733 case v4i64:
734 case v4f16:
735 case v4bf16:
736 case v4f32:
737 case v4f64:
738 case nxv4i1:
739 case nxv4i8:
740 case nxv4i16:
741 case nxv4i32:
742 case nxv4i64:
743 case nxv4f16:
744 case nxv4bf16:
745 case nxv4f32:
746 case nxv4f64: return 4;
747 case v3i16:
748 case v3i32:
749 case v3f16:
750 case v3bf16:
751 case v3f32: return 3;
752 case v2i1:
753 case v2i8:
754 case v2i16:
755 case v2i32:
756 case v2i64:
757 case v2f16:
758 case v2bf16:
759 case v2f32:
760 case v2f64:
761 case nxv2i1:
762 case nxv2i8:
763 case nxv2i16:
764 case nxv2i32:
765 case nxv2i64:
766 case nxv2f16:
767 case nxv2bf16:
768 case nxv2f32:
769 case nxv2f64: return 2;
770 case v1i1:
771 case v1i8:
772 case v1i16:
773 case v1i32:
774 case v1i64:
775 case v1i128:
776 case v1f32:
777 case v1f64:
778 case nxv1i1:
779 case nxv1i8:
780 case nxv1i16:
781 case nxv1i32:
782 case nxv1i64:
783 case nxv1f16:
784 case nxv1f32:
785 case nxv1f64: return 1;
786 }
787 }
788
789 ElementCount getVectorElementCount() const {
790 return ElementCount::get(getVectorNumElements(), isScalableVector());
791 }
792
793 /// Given a vector type, return the minimum number of elements it contains.
794 unsigned getVectorMinNumElements() const {
795 return getVectorElementCount().getKnownMinValue();
796 }
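
Sketch (assumption): fixed-length vectors report an exact element count, while scalable vectors report only a known minimum that the runtime multiplies by vscale.

#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void mvtElementCountExample() {                                   // hypothetical function name
  ElementCount Fixed = MVT(MVT::v4i32).getVectorElementCount();   // 4, fixed
  ElementCount Scal  = MVT(MVT::nxv4i32).getVectorElementCount(); // min 4, scalable
  unsigned Min = MVT(MVT::nxv4i32).getVectorMinNumElements();     // 4
  (void)Fixed; (void)Scal; (void)Min;
}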
797
798 /// Returns the size of the specified MVT in bits.
799 ///
800 /// If the value type is a scalable vector type, the scalable property will
801 /// be set and the runtime size will be a positive integer multiple of the
802 /// base size.
803 TypeSize getSizeInBits() const {
804 switch (SimpleTy) {
805 default:
806      llvm_unreachable("getSizeInBits called on extended MVT.");
807 case Other:
808      llvm_unreachable("Value type is non-standard value, Other.");
809 case iPTR:
810      llvm_unreachable("Value type size is target-dependent. Ask TLI.");
811 case iPTRAny:
812 case iAny:
813 case fAny:
814 case vAny:
815 case Any:
816      llvm_unreachable("Value type is overloaded.");
817 case token:
818      llvm_unreachable("Token type is a sentinel that cannot be used "
819                       "in codegen and has no size");
820 case Metadata:
821 llvm_unreachable("Value type is metadata.");
822 case i1:
823 case v1i1: return TypeSize::Fixed(1);
824 case nxv1i1: return TypeSize::Scalable(1);
825 case v2i1: return TypeSize::Fixed(2);
826 case nxv2i1: return TypeSize::Scalable(2);
827 case v4i1: return TypeSize::Fixed(4);
828 case nxv4i1: return TypeSize::Scalable(4);
829 case i8 :
830 case v1i8:
831 case v8i1: return TypeSize::Fixed(8);
832 case nxv1i8:
833 case nxv8i1: return TypeSize::Scalable(8);
834 case i16 :
835 case f16:
836 case bf16:
837 case v16i1:
838 case v2i8:
839 case v1i16: return TypeSize::Fixed(16);
840 case nxv16i1:
841 case nxv2i8:
842 case nxv1i16:
843 case nxv1f16: return TypeSize::Scalable(16);
844 case f32 :
845 case i32 :
846 case v32i1:
847 case v4i8:
848 case v2i16:
849 case v2f16:
850 case v2bf16:
851 case v1f32:
852 case v1i32: return TypeSize::Fixed(32);
853 case nxv32i1:
854 case nxv4i8:
855 case nxv2i16:
856 case nxv1i32:
857 case nxv2f16:
858 case nxv2bf16:
859 case nxv1f32: return TypeSize::Scalable(32);
860 case v3i16:
861 case v3f16:
862 case v3bf16: return TypeSize::Fixed(48);
863 case x86mmx:
864 case f64 :
865 case i64 :
866 case v64i1:
867 case v8i8:
868 case v4i16:
869 case v2i32:
870 case v1i64:
871 case v4f16:
872 case v4bf16:
873 case v2f32:
874 case v1f64: return TypeSize::Fixed(64);
875 case nxv64i1:
876 case nxv8i8:
877 case nxv4i16:
878 case nxv2i32:
879 case nxv1i64:
880 case nxv4f16:
881 case nxv4bf16:
882 case nxv2f32:
883 case nxv1f64: return TypeSize::Scalable(64);
884 case f80 : return TypeSize::Fixed(80);
885 case v3i32:
886 case v3f32: return TypeSize::Fixed(96);
887 case f128:
888 case ppcf128:
889 case i128:
890 case v128i1:
891 case v16i8:
892 case v8i16:
893 case v4i32:
894 case v2i64:
895 case v1i128:
896 case v8f16:
897 case v8bf16:
898 case v4f32:
899 case v2f64: return TypeSize::Fixed(128);
900 case nxv16i8:
901 case nxv8i16:
902 case nxv4i32:
903 case nxv2i64:
904 case nxv8f16:
905 case nxv8bf16:
906 case nxv4f32:
907 case nxv2f64: return TypeSize::Scalable(128);
908 case v5i32:
909 case v5f32: return TypeSize::Fixed(160);
910 case v256i1:
911 case v32i8:
912 case v16i16:
913 case v8i32:
914 case v4i64:
915 case v16f16:
916 case v16bf16:
917 case v8f32:
918 case v4f64: return TypeSize::Fixed(256);
919 case nxv32i8:
920 case nxv16i16:
921 case nxv8i32:
922 case nxv4i64:
923 case nxv16f16:
924 case nxv8f32:
925 case nxv4f64: return TypeSize::Scalable(256);
926 case v512i1:
927 case v64i8:
928 case v32i16:
929 case v16i32:
930 case v8i64:
931 case v32f16:
932 case v32bf16:
933 case v16f32:
934 case v8f64: return TypeSize::Fixed(512);
935 case nxv64i8:
936 case nxv32i16:
937 case nxv16i32:
938 case nxv8i64:
939 case nxv32f16:
940 case nxv16f32:
941 case nxv8f64: return TypeSize::Scalable(512);
942 case v1024i1:
943 case v128i8:
944 case v64i16:
945 case v32i32:
946 case v16i64:
947 case v64f16:
948 case v64bf16:
949 case v32f32:
950 case v16f64: return TypeSize::Fixed(1024);
951 case nxv32i32:
952 case nxv16i64: return TypeSize::Scalable(1024);
953 case v256i8:
954 case v128i16:
955 case v64i32:
956 case v32i64:
957 case v128f16:
958 case v128bf16:
959 case v64f32:
960 case v32f64: return TypeSize::Fixed(2048);
961 case nxv32i64: return TypeSize::Scalable(2048);
962 case v128i32:
963 case v64i64:
964 case v128f32:
965 case v64f64: return TypeSize::Fixed(4096);
966 case v256i32:
967 case v128i64:
968 case v256f32:
969 case v128f64: return TypeSize::Fixed(8192);
970 case v512i32:
971 case v256i64:
972 case v512f32:
973 case v256f64: return TypeSize::Fixed(16384);
974 case v1024i32:
975 case v1024f32: return TypeSize::Fixed(32768);
976 case v2048i32:
977 case v2048f32: return TypeSize::Fixed(65536);
978 case exnref:
979 case funcref:
980 case externref: return TypeSize::Fixed(0); // opaque type
981 }
982 }
983
984 /// Return the size of the specified fixed width value type in bits. The
985 /// function will assert if the type is scalable.
986 uint64_t getFixedSizeInBits() const {
987 return getSizeInBits().getFixedSize();
988 }
989
990 uint64_t getScalarSizeInBits() const {
991 return getScalarType().getSizeInBits().getFixedSize();
992 }
993
994 /// Return the number of bytes overwritten by a store of the specified value
995 /// type.
996 ///
997 /// If the value type is a scalable vector type, the scalable property will
998 /// be set and the runtime size will be a positive integer multiple of the
999 /// base size.
1000 TypeSize getStoreSize() const {
1001 TypeSize BaseSize = getSizeInBits();
1002 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
1003 }
1004
1005 /// Return the number of bits overwritten by a store of the specified value
1006 /// type.
1007 ///
1008 /// If the value type is a scalable vector type, the scalable property will
1009 /// be set and the runtime size will be a positive integer multiple of the
1010 /// base size.
1011 TypeSize getStoreSizeInBits() const {
1012 return getStoreSize() * 8;
1013 }
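
A minimal usage sketch of getStoreSize()/getStoreSizeInBits(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment and the helper name below is made up:

  // Sketch: how the bit size relates to the store size for fixed and
  // scalable MVTs (store size rounds the bit width up to whole bytes).
  #include "llvm/Support/MachineValueType.h"
  #include <cassert>

  static void storeSizeSketch() {
    using llvm::MVT;
    // v3i32 is 96 bits wide; its store size is (96 + 7) / 8 == 12 bytes.
    MVT V3I32 = MVT::v3i32;
    assert(V3I32.getStoreSize().getKnownMinSize() == 12);
    assert(!V3I32.getStoreSize().isScalable());
    // For a scalable type only the minimum is known at compile time and
    // the scalable flag is preserved.
    MVT NXV4I32 = MVT::nxv4i32;
    assert(NXV4I32.getStoreSize().isScalable());
    assert(NXV4I32.getStoreSize().getKnownMinSize() == 16);
    assert(NXV4I32.getStoreSizeInBits().getKnownMinSize() == 128);
  }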
1014
1015 /// Returns true if the number of bits for the type is a multiple of an
1016 /// 8-bit byte.
1017 bool isByteSized() const { return getSizeInBits().isKnownMultipleOf(8); }
1018
1019 /// Return true if we know at compile time this has more bits than VT.
1020 bool knownBitsGT(MVT VT) const {
1021 return TypeSize::isKnownGT(getSizeInBits(), VT.getSizeInBits());
1022 }
1023
1024 /// Return true if we know at compile time this has more than or the same
1025 /// bits as VT.
1026 bool knownBitsGE(MVT VT) const {
1027 return TypeSize::isKnownGE(getSizeInBits(), VT.getSizeInBits());
1028 }
1029
1030 /// Return true if we know at compile time this has fewer bits than VT.
1031 bool knownBitsLT(MVT VT) const {
1032 return TypeSize::isKnownLT(getSizeInBits(), VT.getSizeInBits());
1033 }
1034
1035 /// Return true if we know at compile time this has fewer than or the same
1036 /// bits as VT.
1037 bool knownBitsLE(MVT VT) const {
1038 return TypeSize::isKnownLE(getSizeInBits(), VT.getSizeInBits());
1039 }
1040
1041 /// Return true if this has more bits than VT.
1042 bool bitsGT(MVT VT) const {
1043 assert(isScalableVector() == VT.isScalableVector() &&
1044 "Comparison between scalable and fixed types");
1045 return knownBitsGT(VT);
1046 }
1047
1048 /// Return true if this has no fewer bits than VT.
1049 bool bitsGE(MVT VT) const {
1050 assert(isScalableVector() == VT.isScalableVector() &&
1051 "Comparison between scalable and fixed types");
1052 return knownBitsGE(VT);
1053 }
1054
1055 /// Return true if this has fewer bits than VT.
1056 bool bitsLT(MVT VT) const {
1057 assert(isScalableVector() == VT.isScalableVector() &&
1058 "Comparison between scalable and fixed types");
1059 return knownBitsLT(VT);
1060 }
1061
1062 /// Return true if this has no more bits than VT.
1063 bool bitsLE(MVT VT) const {
1064 assert(isScalableVector() == VT.isScalableVector() &&
1065 "Comparison between scalable and fixed types");
1066 return knownBitsLE(VT);
1067 }
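
A minimal usage sketch of the bit-width comparisons, illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment and the helper name below is made up:

  // Sketch: bitsGT/bitsLT assert that both operands are either fixed or
  // scalable, while the knownBits* forms answer only what is provable for
  // every possible vscale.
  #include "llvm/Support/MachineValueType.h"

  static void bitCompareSketch() {
    using llvm::MVT;
    MVT F32 = MVT::f32, F64 = MVT::f64;
    bool Wider = F64.bitsGT(F32);            // true: both fixed, 64 > 32 bits
    // Mixing a scalable and a fixed type must go through the knownBits*
    // queries; here "strictly greater" is not provable for every vscale.
    MVT ScalVec = MVT::nxv4i32, FixVec = MVT::v4i32;
    bool ProvablyWider = ScalVec.knownBitsGT(FixVec);
    (void)Wider;
    (void)ProvablyWider;
  }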
1068
1069 static MVT getFloatingPointVT(unsigned BitWidth) {
1070 switch (BitWidth) {
1071 default:
1072 llvm_unreachable("Bad bit width!")::llvm::llvm_unreachable_internal("Bad bit width!", "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/Support/MachineValueType.h"
, 1072)
;
1073 case 16:
1074 return MVT::f16;
1075 case 32:
1076 return MVT::f32;
1077 case 64:
1078 return MVT::f64;
1079 case 80:
1080 return MVT::f80;
1081 case 128:
1082 return MVT::f128;
1083 }
1084 }
1085
1086 static MVT getIntegerVT(unsigned BitWidth) {
1087 switch (BitWidth) {
1088 default:
1089 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1090 case 1:
1091 return MVT::i1;
1092 case 8:
1093 return MVT::i8;
1094 case 16:
1095 return MVT::i16;
1096 case 32:
1097 return MVT::i32;
1098 case 64:
1099 return MVT::i64;
1100 case 128:
1101 return MVT::i128;
1102 }
1103 }
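
A minimal usage sketch of the scalar factories, illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment and the helper name below is made up:

  // Sketch: building simple value types by bit width. getIntegerVT()
  // returns INVALID_SIMPLE_VALUE_TYPE for widths with no matching MVT,
  // while getFloatingPointVT() reaches llvm_unreachable for them.
  #include "llvm/Support/MachineValueType.h"

  static void widthSketch() {
    using llvm::MVT;
    MVT I32 = MVT::getIntegerVT(32);        // MVT::i32
    MVT I24 = MVT::getIntegerVT(24);        // INVALID_SIMPLE_VALUE_TYPE
    MVT F64 = MVT::getFloatingPointVT(64);  // MVT::f64
    (void)I32;
    (void)I24;
    (void)F64;
  }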
1104
1105 static MVT getVectorVT(MVT VT, unsigned NumElements) {
1106 switch (VT.SimpleTy) {
1107 default:
1108 break;
1109 case MVT::i1:
1110 if (NumElements == 1) return MVT::v1i1;
1111 if (NumElements == 2) return MVT::v2i1;
1112 if (NumElements == 4) return MVT::v4i1;
1113 if (NumElements == 8) return MVT::v8i1;
1114 if (NumElements == 16) return MVT::v16i1;
1115 if (NumElements == 32) return MVT::v32i1;
1116 if (NumElements == 64) return MVT::v64i1;
1117 if (NumElements == 128) return MVT::v128i1;
1118 if (NumElements == 256) return MVT::v256i1;
1119 if (NumElements == 512) return MVT::v512i1;
1120 if (NumElements == 1024) return MVT::v1024i1;
1121 break;
1122 case MVT::i8:
1123 if (NumElements == 1) return MVT::v1i8;
1124 if (NumElements == 2) return MVT::v2i8;
1125 if (NumElements == 4) return MVT::v4i8;
1126 if (NumElements == 8) return MVT::v8i8;
1127 if (NumElements == 16) return MVT::v16i8;
1128 if (NumElements == 32) return MVT::v32i8;
1129 if (NumElements == 64) return MVT::v64i8;
1130 if (NumElements == 128) return MVT::v128i8;
1131 if (NumElements == 256) return MVT::v256i8;
1132 break;
1133 case MVT::i16:
1134 if (NumElements == 1) return MVT::v1i16;
1135 if (NumElements == 2) return MVT::v2i16;
1136 if (NumElements == 3) return MVT::v3i16;
1137 if (NumElements == 4) return MVT::v4i16;
1138 if (NumElements == 8) return MVT::v8i16;
1139 if (NumElements == 16) return MVT::v16i16;
1140 if (NumElements == 32) return MVT::v32i16;
1141 if (NumElements == 64) return MVT::v64i16;
1142 if (NumElements == 128) return MVT::v128i16;
1143 break;
1144 case MVT::i32:
1145 if (NumElements == 1) return MVT::v1i32;
1146 if (NumElements == 2) return MVT::v2i32;
1147 if (NumElements == 3) return MVT::v3i32;
1148 if (NumElements == 4) return MVT::v4i32;
1149 if (NumElements == 5) return MVT::v5i32;
1150 if (NumElements == 8) return MVT::v8i32;
1151 if (NumElements == 16) return MVT::v16i32;
1152 if (NumElements == 32) return MVT::v32i32;
1153 if (NumElements == 64) return MVT::v64i32;
1154 if (NumElements == 128) return MVT::v128i32;
1155 if (NumElements == 256) return MVT::v256i32;
1156 if (NumElements == 512) return MVT::v512i32;
1157 if (NumElements == 1024) return MVT::v1024i32;
1158 if (NumElements == 2048) return MVT::v2048i32;
1159 break;
1160 case MVT::i64:
1161 if (NumElements == 1) return MVT::v1i64;
1162 if (NumElements == 2) return MVT::v2i64;
1163 if (NumElements == 4) return MVT::v4i64;
1164 if (NumElements == 8) return MVT::v8i64;
1165 if (NumElements == 16) return MVT::v16i64;
1166 if (NumElements == 32) return MVT::v32i64;
1167 if (NumElements == 64) return MVT::v64i64;
1168 if (NumElements == 128) return MVT::v128i64;
1169 if (NumElements == 256) return MVT::v256i64;
1170 break;
1171 case MVT::i128:
1172 if (NumElements == 1) return MVT::v1i128;
1173 break;
1174 case MVT::f16:
1175 if (NumElements == 2) return MVT::v2f16;
1176 if (NumElements == 3) return MVT::v3f16;
1177 if (NumElements == 4) return MVT::v4f16;
1178 if (NumElements == 8) return MVT::v8f16;
1179 if (NumElements == 16) return MVT::v16f16;
1180 if (NumElements == 32) return MVT::v32f16;
1181 if (NumElements == 64) return MVT::v64f16;
1182 if (NumElements == 128) return MVT::v128f16;
1183 break;
1184 case MVT::bf16:
1185 if (NumElements == 2) return MVT::v2bf16;
1186 if (NumElements == 3) return MVT::v3bf16;
1187 if (NumElements == 4) return MVT::v4bf16;
1188 if (NumElements == 8) return MVT::v8bf16;
1189 if (NumElements == 16) return MVT::v16bf16;
1190 if (NumElements == 32) return MVT::v32bf16;
1191 if (NumElements == 64) return MVT::v64bf16;
1192 if (NumElements == 128) return MVT::v128bf16;
1193 break;
1194 case MVT::f32:
1195 if (NumElements == 1) return MVT::v1f32;
1196 if (NumElements == 2) return MVT::v2f32;
1197 if (NumElements == 3) return MVT::v3f32;
1198 if (NumElements == 4) return MVT::v4f32;
1199 if (NumElements == 5) return MVT::v5f32;
1200 if (NumElements == 8) return MVT::v8f32;
1201 if (NumElements == 16) return MVT::v16f32;
1202 if (NumElements == 32) return MVT::v32f32;
1203 if (NumElements == 64) return MVT::v64f32;
1204 if (NumElements == 128) return MVT::v128f32;
1205 if (NumElements == 256) return MVT::v256f32;
1206 if (NumElements == 512) return MVT::v512f32;
1207 if (NumElements == 1024) return MVT::v1024f32;
1208 if (NumElements == 2048) return MVT::v2048f32;
1209 break;
1210 case MVT::f64:
1211 if (NumElements == 1) return MVT::v1f64;
1212 if (NumElements == 2) return MVT::v2f64;
1213 if (NumElements == 4) return MVT::v4f64;
1214 if (NumElements == 8) return MVT::v8f64;
1215 if (NumElements == 16) return MVT::v16f64;
1216 if (NumElements == 32) return MVT::v32f64;
1217 if (NumElements == 64) return MVT::v64f64;
1218 if (NumElements == 128) return MVT::v128f64;
1219 if (NumElements == 256) return MVT::v256f64;
1220 break;
1221 }
1222 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1223 }
1224
1225 static MVT getScalableVectorVT(MVT VT, unsigned NumElements) {
1226 switch(VT.SimpleTy) {
1227 default:
1228 break;
1229 case MVT::i1:
1230 if (NumElements == 1) return MVT::nxv1i1;
1231 if (NumElements == 2) return MVT::nxv2i1;
1232 if (NumElements == 4) return MVT::nxv4i1;
1233 if (NumElements == 8) return MVT::nxv8i1;
1234 if (NumElements == 16) return MVT::nxv16i1;
1235 if (NumElements == 32) return MVT::nxv32i1;
1236 if (NumElements == 64) return MVT::nxv64i1;
1237 break;
1238 case MVT::i8:
1239 if (NumElements == 1) return MVT::nxv1i8;
1240 if (NumElements == 2) return MVT::nxv2i8;
1241 if (NumElements == 4) return MVT::nxv4i8;
1242 if (NumElements == 8) return MVT::nxv8i8;
1243 if (NumElements == 16) return MVT::nxv16i8;
1244 if (NumElements == 32) return MVT::nxv32i8;
1245 if (NumElements == 64) return MVT::nxv64i8;
1246 break;
1247 case MVT::i16:
1248 if (NumElements == 1) return MVT::nxv1i16;
1249 if (NumElements == 2) return MVT::nxv2i16;
1250 if (NumElements == 4) return MVT::nxv4i16;
1251 if (NumElements == 8) return MVT::nxv8i16;
1252 if (NumElements == 16) return MVT::nxv16i16;
1253 if (NumElements == 32) return MVT::nxv32i16;
1254 break;
1255 case MVT::i32:
1256 if (NumElements == 1) return MVT::nxv1i32;
1257 if (NumElements == 2) return MVT::nxv2i32;
1258 if (NumElements == 4) return MVT::nxv4i32;
1259 if (NumElements == 8) return MVT::nxv8i32;
1260 if (NumElements == 16) return MVT::nxv16i32;
1261 if (NumElements == 32) return MVT::nxv32i32;
1262 break;
1263 case MVT::i64:
1264 if (NumElements == 1) return MVT::nxv1i64;
1265 if (NumElements == 2) return MVT::nxv2i64;
1266 if (NumElements == 4) return MVT::nxv4i64;
1267 if (NumElements == 8) return MVT::nxv8i64;
1268 if (NumElements == 16) return MVT::nxv16i64;
1269 if (NumElements == 32) return MVT::nxv32i64;
1270 break;
1271 case MVT::f16:
1272 if (NumElements == 1) return MVT::nxv1f16;
1273 if (NumElements == 2) return MVT::nxv2f16;
1274 if (NumElements == 4) return MVT::nxv4f16;
1275 if (NumElements == 8) return MVT::nxv8f16;
1276 if (NumElements == 16) return MVT::nxv16f16;
1277 if (NumElements == 32) return MVT::nxv32f16;
1278 break;
1279 case MVT::bf16:
1280 if (NumElements == 2) return MVT::nxv2bf16;
1281 if (NumElements == 4) return MVT::nxv4bf16;
1282 if (NumElements == 8) return MVT::nxv8bf16;
1283 break;
1284 case MVT::f32:
1285 if (NumElements == 1) return MVT::nxv1f32;
1286 if (NumElements == 2) return MVT::nxv2f32;
1287 if (NumElements == 4) return MVT::nxv4f32;
1288 if (NumElements == 8) return MVT::nxv8f32;
1289 if (NumElements == 16) return MVT::nxv16f32;
1290 break;
1291 case MVT::f64:
1292 if (NumElements == 1) return MVT::nxv1f64;
1293 if (NumElements == 2) return MVT::nxv2f64;
1294 if (NumElements == 4) return MVT::nxv4f64;
1295 if (NumElements == 8) return MVT::nxv8f64;
1296 break;
1297 }
1298 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1299 }
1300
1301 static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) {
1302 if (IsScalable)
1303 return getScalableVectorVT(VT, NumElements);
1304 return getVectorVT(VT, NumElements);
1305 }
1306
1307 static MVT getVectorVT(MVT VT, ElementCount EC) {
1308 if (EC.isScalable())
1309 return getScalableVectorVT(VT, EC.getKnownMinValue());
1310 return getVectorVT(VT, EC.getKnownMinValue());
1311 }
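
A minimal usage sketch of the vector factories, illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment and the helper name below is made up:

  // Sketch: the ElementCount overload forwards to the fixed-length or
  // scalable factory depending on EC.isScalable().
  #include "llvm/Support/MachineValueType.h"

  static void vectorFactorySketch() {
    using llvm::ElementCount;
    using llvm::MVT;
    MVT Fixed    = MVT::getVectorVT(MVT::i32, ElementCount::get(4, /*Scalable=*/false));
    MVT Scalable = MVT::getVectorVT(MVT::i32, ElementCount::get(4, /*Scalable=*/true));
    // Fixed is MVT::v4i32, Scalable is MVT::nxv4i32.
    (void)Fixed;
    (void)Scalable;
  }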
1312
1313 /// Return the value type corresponding to the specified type. This returns
1314 /// all pointers as iPTR. If HandleUnknown is true, unknown types are
1315 /// returned as Other, otherwise they are invalid.
1316 static MVT getVT(Type *Ty, bool HandleUnknown = false);
1317
1318 private:
1319 /// A simple iterator over the MVT::SimpleValueType enum.
1320 struct mvt_iterator {
1321 SimpleValueType VT;
1322
1323 mvt_iterator(SimpleValueType VT) : VT(VT) {}
1324
1325 MVT operator*() const { return VT; }
1326 bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
1327
1328 mvt_iterator& operator++() {
1329 VT = (MVT::SimpleValueType)((int)VT + 1);
1330 assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
1331 "MVT iterator overflowed.");
1332 return *this;
1333 }
1334 };
1335
1336 /// A range of the MVT::SimpleValueType enum.
1337 using mvt_range = iterator_range<mvt_iterator>;
1338
1339 public:
1340 /// SimpleValueType Iteration
1341 /// @{
1342 static mvt_range all_valuetypes() {
1343 return mvt_range(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
1344 }
1345
1346 static mvt_range integer_valuetypes() {
1347 return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
1348 (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
1349 }
1350
1351 static mvt_range fp_valuetypes() {
1352 return mvt_range(MVT::FIRST_FP_VALUETYPE,
1353 (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
1354 }
1355
1356 static mvt_range vector_valuetypes() {
1357 return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
1358 (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
1359 }
1360
1361 static mvt_range fixedlen_vector_valuetypes() {
1362 return mvt_range(
1363 MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
1364 (MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1));
1365 }
1366
1367 static mvt_range scalable_vector_valuetypes() {
1368 return mvt_range(
1369 MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
1370 (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1));
1371 }
1372
1373 static mvt_range integer_fixedlen_vector_valuetypes() {
1374 return mvt_range(
1375 MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
1376 (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1));
1377 }
1378
1379 static mvt_range fp_fixedlen_vector_valuetypes() {
1380 return mvt_range(
1381 MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
1382 (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1));
1383 }
1384
1385 static mvt_range integer_scalable_vector_valuetypes() {
1386 return mvt_range(
1387 MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
1388 (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1));
1389 }
1390
1391 static mvt_range fp_scalable_vector_valuetypes() {
1392 return mvt_range(
1393 MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
1394 (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1));
1395 }
1396 /// @}
1397 };
1398
1399} // end namespace llvm
1400
1401#endif // LLVM_CODEGEN_MACHINEVALUETYPE_H

/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/ADT/STLExtras.h

1//===- llvm/ADT/STLExtras.h - Useful STL related functions ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some templates that are useful if you are working with the
10// STL at all.
11//
12// No library is required when using these functions.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_ADT_STLEXTRAS_H
17#define LLVM_ADT_STLEXTRAS_H
18
19#include "llvm/ADT/Optional.h"
20#include "llvm/ADT/iterator.h"
21#include "llvm/ADT/iterator_range.h"
22#include "llvm/Config/abi-breaking.h"
23#include "llvm/Support/ErrorHandling.h"
24#include <algorithm>
25#include <cassert>
26#include <cstddef>
27#include <cstdint>
28#include <cstdlib>
29#include <functional>
30#include <initializer_list>
31#include <iterator>
32#include <limits>
33#include <memory>
34#include <tuple>
35#include <type_traits>
36#include <utility>
37
38#ifdef EXPENSIVE_CHECKS
39#include <random> // for std::mt19937
40#endif
41
42namespace llvm {
43
44// Only used by compiler if both template types are the same. Useful when
45// using SFINAE to test for the existence of member functions.
46template <typename T, T> struct SameType;
47
48namespace detail {
49
50template <typename RangeT>
51using IterOfRange = decltype(std::begin(std::declval<RangeT &>()));
52
53template <typename RangeT>
54using ValueOfRange = typename std::remove_reference<decltype(
55 *std::begin(std::declval<RangeT &>()))>::type;
56
57} // end namespace detail
58
59//===----------------------------------------------------------------------===//
60// Extra additions to <type_traits>
61//===----------------------------------------------------------------------===//
62
63template <typename T>
64struct negation : std::integral_constant<bool, !bool(T::value)> {};
65
66template <typename...> struct conjunction : std::true_type {};
67template <typename B1> struct conjunction<B1> : B1 {};
68template <typename B1, typename... Bn>
69struct conjunction<B1, Bn...>
70 : std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
71
72template <typename T> struct make_const_ptr {
73 using type =
74 typename std::add_pointer<typename std::add_const<T>::type>::type;
75};
76
77template <typename T> struct make_const_ref {
78 using type = typename std::add_lvalue_reference<
79 typename std::add_const<T>::type>::type;
80};
81
82/// Utilities for detecting if a given trait holds for some set of arguments
83/// 'Args'. For example, the given trait could be used to detect if a given type
84/// has a copy assignment operator:
85/// template<class T>
86/// using has_copy_assign_t = decltype(std::declval<T&>()
87/// = std::declval<const T&>());
88/// bool fooHasCopyAssign = is_detected<has_copy_assign_t, FooClass>::value;
89namespace detail {
90template <typename...> using void_t = void;
91template <class, template <class...> class Op, class... Args> struct detector {
92 using value_t = std::false_type;
93};
94template <template <class...> class Op, class... Args>
95struct detector<void_t<Op<Args...>>, Op, Args...> {
96 using value_t = std::true_type;
97};
98} // end namespace detail
99
100template <template <class...> class Op, class... Args>
101using is_detected = typename detail::detector<void, Op, Args...>::value_t;
102
103/// Check if a Callable type can be invoked with the given set of arg types.
104namespace detail {
105template <typename Callable, typename... Args>
106using is_invocable =
107 decltype(std::declval<Callable &>()(std::declval<Args>()...));
108} // namespace detail
109
110template <typename Callable, typename... Args>
111using is_invocable = is_detected<detail::is_invocable, Callable, Args...>;
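
A minimal usage sketch of is_detected, illustrative only and not part of the annotated header; it mirrors the has_copy_assign_t example from the comment above, assumes an LLVM 12 build environment, and the sketch namespace and struct names are made up:

  #include "llvm/ADT/STLExtras.h"
  #include <utility>

  namespace sketch {
  // Trait that is well-formed only if T has a copy assignment operator.
  template <class T>
  using has_copy_assign_t =
      decltype(std::declval<T &>() = std::declval<const T &>());

  struct Copyable {};
  struct NotCopyable {
    NotCopyable &operator=(const NotCopyable &) = delete;
  };

  static_assert(llvm::is_detected<has_copy_assign_t, Copyable>::value,
                "Copyable has a copy assignment operator");
  static_assert(!llvm::is_detected<has_copy_assign_t, NotCopyable>::value,
                "the deleted operator= makes detection fail");
  } // namespace sketch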
112
113/// This class provides various trait information about a callable object.
114/// * To access the number of arguments: Traits::num_args
115/// * To access the type of an argument: Traits::arg_t<Index>
116/// * To access the type of the result: Traits::result_t
117template <typename T, bool isClass = std::is_class<T>::value>
118struct function_traits : public function_traits<decltype(&T::operator())> {};
119
120/// Overload for class function types.
121template <typename ClassType, typename ReturnType, typename... Args>
122struct function_traits<ReturnType (ClassType::*)(Args...) const, false> {
123 /// The number of arguments to this function.
124 enum { num_args = sizeof...(Args) };
125
126 /// The result type of this function.
127 using result_t = ReturnType;
128
129 /// The type of an argument to this function.
130 template <size_t Index>
131 using arg_t = typename std::tuple_element<Index, std::tuple<Args...>>::type;
132};
133/// Overload for class function types.
134template <typename ClassType, typename ReturnType, typename... Args>
135struct function_traits<ReturnType (ClassType::*)(Args...), false>
136 : function_traits<ReturnType (ClassType::*)(Args...) const> {};
137/// Overload for non-class function types.
138template <typename ReturnType, typename... Args>
139struct function_traits<ReturnType (*)(Args...), false> {
140 /// The number of arguments to this function.
141 enum { num_args = sizeof...(Args) };
142
143 /// The result type of this function.
144 using result_t = ReturnType;
145
146 /// The type of an argument to this function.
147 template <size_t i>
148 using arg_t = typename std::tuple_element<i, std::tuple<Args...>>::type;
149};
150/// Overload for non-class function type references.
151template <typename ReturnType, typename... Args>
152struct function_traits<ReturnType (&)(Args...), false>
153 : public function_traits<ReturnType (*)(Args...)> {};
154
155//===----------------------------------------------------------------------===//
156// Extra additions to <functional>
157//===----------------------------------------------------------------------===//
158
159template <class Ty> struct identity {
160 using argument_type = Ty;
161
162 Ty &operator()(Ty &self) const {
163 return self;
164 }
165 const Ty &operator()(const Ty &self) const {
166 return self;
167 }
168};
169
170/// An efficient, type-erasing, non-owning reference to a callable. This is
171/// intended for use as the type of a function parameter that is not used
172/// after the function in question returns.
173///
174/// This class does not own the callable, so it is not in general safe to store
175/// a function_ref.
176template<typename Fn> class function_ref;
177
178template<typename Ret, typename ...Params>
179class function_ref<Ret(Params...)> {
180 Ret (*callback)(intptr_t callable, Params ...params) = nullptr;
181 intptr_t callable;
182
183 template<typename Callable>
184 static Ret callback_fn(intptr_t callable, Params ...params) {
185 return (*reinterpret_cast<Callable*>(callable))(
186 std::forward<Params>(params)...);
187 }
188
189public:
190 function_ref() = default;
191 function_ref(std::nullptr_t) {}
192
193 template <typename Callable>
194 function_ref(
195 Callable &&callable,
196 // This is not the copy-constructor.
197 std::enable_if_t<
198 !std::is_same<std::remove_cv_t<std::remove_reference_t<Callable>>,
199 function_ref>::value> * = nullptr,
200 // Functor must be callable and return a suitable type.
201 std::enable_if_t<std::is_void<Ret>::value ||
202 std::is_convertible<decltype(std::declval<Callable>()(
203 std::declval<Params>()...)),
204 Ret>::value> * = nullptr)
205 : callback(callback_fn<typename std::remove_reference<Callable>::type>),
206 callable(reinterpret_cast<intptr_t>(&callable)) {}
207
208 Ret operator()(Params ...params) const {
209 return callback(callable, std::forward<Params>(params)...);
210 }
211
212 explicit operator bool() const { return callback; }
213};
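
A minimal usage sketch of function_ref, illustrative only and not part of the annotated header; applyTwice is a made-up helper, not an LLVM API:

  #include "llvm/ADT/STLExtras.h"

  // F is a non-owning view of the callable; it must not be stored past the
  // end of this call.
  static int applyTwice(llvm::function_ref<int(int)> F, int X) {
    return F(F(X));
  }

  static int functionRefSketch() {
    int Bias = 3;
    // The lambda and its capture outlive the call, so passing it is safe.
    return applyTwice([&Bias](int V) { return V + Bias; }, 10); // 16
  }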
214
215//===----------------------------------------------------------------------===//
216// Extra additions to <iterator>
217//===----------------------------------------------------------------------===//
218
219namespace adl_detail {
220
221using std::begin;
222
223template <typename ContainerTy>
224decltype(auto) adl_begin(ContainerTy &&container) {
225 return begin(std::forward<ContainerTy>(container));
226}
227
228using std::end;
229
230template <typename ContainerTy>
231decltype(auto) adl_end(ContainerTy &&container) {
232 return end(std::forward<ContainerTy>(container));
233}
234
235using std::swap;
236
237template <typename T>
238void adl_swap(T &&lhs, T &&rhs) noexcept(noexcept(swap(std::declval<T>(),
239 std::declval<T>()))) {
240 swap(std::forward<T>(lhs), std::forward<T>(rhs));
241}
242
243} // end namespace adl_detail
244
245template <typename ContainerTy>
246decltype(auto) adl_begin(ContainerTy &&container) {
247 return adl_detail::adl_begin(std::forward<ContainerTy>(container));
248}
249
250template <typename ContainerTy>
251decltype(auto) adl_end(ContainerTy &&container) {
252 return adl_detail::adl_end(std::forward<ContainerTy>(container));
253}
254
255template <typename T>
256void adl_swap(T &&lhs, T &&rhs) noexcept(
257 noexcept(adl_detail::adl_swap(std::declval<T>(), std::declval<T>()))) {
258 adl_detail::adl_swap(std::forward<T>(lhs), std::forward<T>(rhs));
259}
260
261/// Test whether \p RangeOrContainer is empty. Similar to C++17 std::empty.
262template <typename T>
263constexpr bool empty(const T &RangeOrContainer) {
264 return adl_begin(RangeOrContainer) == adl_end(RangeOrContainer);
265}
266
267/// Returns true if the given container only contains a single element.
268template <typename ContainerTy> bool hasSingleElement(ContainerTy &&C) {
269 auto B = std::begin(C), E = std::end(C);
270 return B != E && std::next(B) == E;
271}
272
273/// Return a range covering \p RangeOrContainer with the first N elements
274/// excluded.
275template <typename T> auto drop_begin(T &&RangeOrContainer, size_t N) {
276 return make_range(std::next(adl_begin(RangeOrContainer), N),
277 adl_end(RangeOrContainer));
278}
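
A minimal usage sketch of hasSingleElement() and drop_begin(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment:

  #include "llvm/ADT/STLExtras.h"
  #include <vector>

  static int dropBeginSketch() {
    std::vector<int> V = {1, 2, 3, 4};
    bool Single = llvm::hasSingleElement(V);   // false: V has four elements
    int Sum = 0;
    for (int X : llvm::drop_begin(V, 1))       // visits 2, 3, 4
      Sum += X;
    return Single ? 0 : Sum;                   // 9
  }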
279
280// mapped_iterator - This is a simple iterator adapter that causes a function to
281// be applied whenever operator* is invoked on the iterator.
282
283template <typename ItTy, typename FuncTy,
284 typename FuncReturnTy =
285 decltype(std::declval<FuncTy>()(*std::declval<ItTy>()))>
286class mapped_iterator
287 : public iterator_adaptor_base<
288 mapped_iterator<ItTy, FuncTy>, ItTy,
289 typename std::iterator_traits<ItTy>::iterator_category,
290 typename std::remove_reference<FuncReturnTy>::type> {
291public:
292 mapped_iterator(ItTy U, FuncTy F)
293 : mapped_iterator::iterator_adaptor_base(std::move(U)), F(std::move(F)) {}
294
295 ItTy getCurrent() { return this->I; }
296
297 FuncReturnTy operator*() const { return F(*this->I); }
298
299private:
300 FuncTy F;
301};
302
303// map_iterator - Provide a convenient way to create mapped_iterators, just like
304// make_pair is useful for creating pairs...
305template <class ItTy, class FuncTy>
306inline mapped_iterator<ItTy, FuncTy> map_iterator(ItTy I, FuncTy F) {
307 return mapped_iterator<ItTy, FuncTy>(std::move(I), std::move(F));
308}
309
310template <class ContainerTy, class FuncTy>
311auto map_range(ContainerTy &&C, FuncTy F) {
312 return make_range(map_iterator(C.begin(), F), map_iterator(C.end(), F));
313}
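
A minimal usage sketch of map_range(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment:

  #include "llvm/ADT/STLExtras.h"
  #include <vector>

  static int mapRangeSketch() {
    std::vector<int> V = {1, 2, 3};
    int Sum = 0;
    // The functor runs lazily on each dereference; no new container is built.
    for (int Doubled : llvm::map_range(V, [](int X) { return 2 * X; }))
      Sum += Doubled;
    return Sum; // 12
  }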
314
315/// Helper to determine if type T has a member called rbegin().
316template <typename Ty> class has_rbegin_impl {
317 using yes = char[1];
318 using no = char[2];
319
320 template <typename Inner>
321 static yes& test(Inner *I, decltype(I->rbegin()) * = nullptr);
322
323 template <typename>
324 static no& test(...);
325
326public:
327 static const bool value = sizeof(test<Ty>(nullptr)) == sizeof(yes);
328};
329
330/// Metafunction to determine if T& or T has a member called rbegin().
331template <typename Ty>
332struct has_rbegin : has_rbegin_impl<typename std::remove_reference<Ty>::type> {
333};
334
335// Returns an iterator_range over the given container which iterates in reverse.
336// Note that the container must have rbegin()/rend() methods for this to work.
337template <typename ContainerTy>
338auto reverse(ContainerTy &&C,
339 std::enable_if_t<has_rbegin<ContainerTy>::value> * = nullptr) {
340 return make_range(C.rbegin(), C.rend());
341}
342
343// Returns a std::reverse_iterator wrapped around the given iterator.
344template <typename IteratorTy>
345std::reverse_iterator<IteratorTy> make_reverse_iterator(IteratorTy It) {
346 return std::reverse_iterator<IteratorTy>(It);
347}
348
349// Returns an iterator_range over the given container which iterates in reverse.
350// Note that the container must have begin()/end() methods which return
351// bidirectional iterators for this to work.
352template <typename ContainerTy>
353auto reverse(ContainerTy &&C,
354 std::enable_if_t<!has_rbegin<ContainerTy>::value> * = nullptr) {
355 return make_range(llvm::make_reverse_iterator(std::end(C)),
356 llvm::make_reverse_iterator(std::begin(C)));
357}
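
A minimal usage sketch of llvm::reverse(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment:

  #include "llvm/ADT/STLExtras.h"
  #include <vector>

  static int reverseSketch() {
    std::vector<int> V = {1, 2, 3};
    int Last = 0;
    // std::vector has rbegin()/rend(), so the first overload above is used;
    // a container with only bidirectional begin()/end() takes the fallback.
    for (int X : llvm::reverse(V))
      Last = X;        // visits 3, 2, 1
    return Last;       // 1
  }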
358
359/// An iterator adaptor that filters the elements of given inner iterators.
360///
361/// The predicate parameter should be a callable object that accepts the wrapped
362/// iterator's reference type and returns a bool. When incrementing or
363/// decrementing the iterator, it will call the predicate on each element and
364/// skip any where it returns false.
365///
366/// \code
367/// int A[] = { 1, 2, 3, 4 };
368/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; });
369/// // R contains { 1, 3 }.
370/// \endcode
371///
372/// Note: filter_iterator_base implements support for forward iteration.
373/// filter_iterator_impl exists to provide support for bidirectional iteration,
374/// conditional on whether the wrapped iterator supports it.
375template <typename WrappedIteratorT, typename PredicateT, typename IterTag>
376class filter_iterator_base
377 : public iterator_adaptor_base<
378 filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
379 WrappedIteratorT,
380 typename std::common_type<
381 IterTag, typename std::iterator_traits<
382 WrappedIteratorT>::iterator_category>::type> {
383 using BaseT = iterator_adaptor_base<
384 filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
385 WrappedIteratorT,
386 typename std::common_type<
387 IterTag, typename std::iterator_traits<
388 WrappedIteratorT>::iterator_category>::type>;
389
390protected:
391 WrappedIteratorT End;
392 PredicateT Pred;
393
394 void findNextValid() {
395 while (this->I != End && !Pred(*this->I))
396 BaseT::operator++();
397 }
398
399 // Construct the iterator. The begin iterator needs to know where the end
400 // is, so that it can properly stop when it gets there. The end iterator only
401 // needs the predicate to support bidirectional iteration.
402 filter_iterator_base(WrappedIteratorT Begin, WrappedIteratorT End,
403 PredicateT Pred)
404 : BaseT(Begin), End(End), Pred(Pred) {
405 findNextValid();
406 }
407
408public:
409 using BaseT::operator++;
410
411 filter_iterator_base &operator++() {
412 BaseT::operator++();
413 findNextValid();
414 return *this;
415 }
416};
417
418/// Specialization of filter_iterator_base for forward iteration only.
419template <typename WrappedIteratorT, typename PredicateT,
420 typename IterTag = std::forward_iterator_tag>
421class filter_iterator_impl
422 : public filter_iterator_base<WrappedIteratorT, PredicateT, IterTag> {
423 using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>;
424
425public:
426 filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
427 PredicateT Pred)
428 : BaseT(Begin, End, Pred) {}
429};
430
431/// Specialization of filter_iterator_base for bidirectional iteration.
432template <typename WrappedIteratorT, typename PredicateT>
433class filter_iterator_impl<WrappedIteratorT, PredicateT,
434 std::bidirectional_iterator_tag>
435 : public filter_iterator_base<WrappedIteratorT, PredicateT,
436 std::bidirectional_iterator_tag> {
437 using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT,
438 std::bidirectional_iterator_tag>;
439 void findPrevValid() {
440 while (!this->Pred(*this->I))
441 BaseT::operator--();
442 }
443
444public:
445 using BaseT::operator--;
446
447 filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
448 PredicateT Pred)
449 : BaseT(Begin, End, Pred) {}
450
451 filter_iterator_impl &operator--() {
452 BaseT::operator--();
453 findPrevValid();
454 return *this;
455 }
456};
457
458namespace detail {
459
460template <bool is_bidirectional> struct fwd_or_bidi_tag_impl {
461 using type = std::forward_iterator_tag;
462};
463
464template <> struct fwd_or_bidi_tag_impl<true> {
465 using type = std::bidirectional_iterator_tag;
466};
467
468/// Helper which sets its type member to forward_iterator_tag if the category
469/// of \p IterT does not derive from bidirectional_iterator_tag, and to
470/// bidirectional_iterator_tag otherwise.
471template <typename IterT> struct fwd_or_bidi_tag {
472 using type = typename fwd_or_bidi_tag_impl<std::is_base_of<
473 std::bidirectional_iterator_tag,
474 typename std::iterator_traits<IterT>::iterator_category>::value>::type;
475};
476
477} // namespace detail
478
479/// Defines filter_iterator to a suitable specialization of
480/// filter_iterator_impl, based on the underlying iterator's category.
481template <typename WrappedIteratorT, typename PredicateT>
482using filter_iterator = filter_iterator_impl<
483 WrappedIteratorT, PredicateT,
484 typename detail::fwd_or_bidi_tag<WrappedIteratorT>::type>;
485
486/// Convenience function that takes a range of elements and a predicate,
487/// and return a new filter_iterator range.
488///
489/// FIXME: Currently if RangeT && is an rvalue reference to a temporary, the
490/// lifetime of that temporary is not kept by the returned range object, and the
491/// temporary is going to be dropped on the floor after the make_iterator_range
492/// full expression that contains this function call.
493template <typename RangeT, typename PredicateT>
494iterator_range<filter_iterator<detail::IterOfRange<RangeT>, PredicateT>>
495make_filter_range(RangeT &&Range, PredicateT Pred) {
496 using FilterIteratorT =
497 filter_iterator<detail::IterOfRange<RangeT>, PredicateT>;
498 return make_range(
499 FilterIteratorT(std::begin(std::forward<RangeT>(Range)),
500 std::end(std::forward<RangeT>(Range)), Pred),
501 FilterIteratorT(std::end(std::forward<RangeT>(Range)),
502 std::end(std::forward<RangeT>(Range)), Pred));
503}
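
A minimal usage sketch of make_filter_range(), illustrative only and not part of the annotated header; per the FIXME above it passes a named range, not a temporary:

  #include "llvm/ADT/STLExtras.h"

  static int filterRangeSketch() {
    int A[] = {1, 2, 3, 4};
    int Sum = 0;
    // Keeps only the odd elements, as in the filter_iterator example above.
    for (int N : llvm::make_filter_range(A, [](int N) { return N % 2 == 1; }))
      Sum += N;   // 1 + 3
    return Sum;   // 4
  }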
504
505/// A pseudo-iterator adaptor that is designed to implement "early increment"
506/// style loops.
507///
508/// This is *not a normal iterator* and should almost never be used directly. It
509/// is intended primarily to be used with range based for loops and some range
510/// algorithms.
511///
512/// The iterator isn't quite an `OutputIterator` or an `InputIterator` but
513/// somewhere between them. The constraints of these iterators are:
514///
515/// - On construction or after being incremented, it is comparable and
516/// dereferencable. It is *not* incrementable.
517/// - After being dereferenced, it is neither comparable nor dereferencable, it
518/// is only incrementable.
519///
520/// This means you can only dereference the iterator once, and you can only
521/// increment it once between dereferences.
522template <typename WrappedIteratorT>
523class early_inc_iterator_impl
524 : public iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
525 WrappedIteratorT, std::input_iterator_tag> {
526 using BaseT =
527 iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
528 WrappedIteratorT, std::input_iterator_tag>;
529
530 using PointerT = typename std::iterator_traits<WrappedIteratorT>::pointer;
531
532protected:
533#if LLVM_ENABLE_ABI_BREAKING_CHECKS
534 bool IsEarlyIncremented = false;
535#endif
536
537public:
538 early_inc_iterator_impl(WrappedIteratorT I) : BaseT(I) {}
539
540 using BaseT::operator*;
541 typename BaseT::reference operator*() {
542#if LLVM_ENABLE_ABI_BREAKING_CHECKS
543 assert(!IsEarlyIncremented && "Cannot dereference twice!");
544 IsEarlyIncremented = true;
545#endif
546 return *(this->I)++;
547 }
548
549 using BaseT::operator++;
550 early_inc_iterator_impl &operator++() {
551#if LLVM_ENABLE_ABI_BREAKING_CHECKS
552 assert(IsEarlyIncremented && "Cannot increment before dereferencing!");
553 IsEarlyIncremented = false;
554#endif
555 return *this;
556 }
557
558 using BaseT::operator==;
559 bool operator==(const early_inc_iterator_impl &RHS) const {
560#if LLVM_ENABLE_ABI_BREAKING_CHECKS
561 assert(!IsEarlyIncremented && "Cannot compare after dereferencing!");
562#endif
563 return BaseT::operator==(RHS);
564 }
565};
566
567/// Make a range that does early increment to allow mutation of the underlying
568/// range without disrupting iteration.
569///
570/// The underlying iterator will be incremented immediately after it is
571/// dereferenced, allowing deletion of the current node or insertion of nodes to
572/// not disrupt iteration provided they do not invalidate the *next* iterator --
573/// the current iterator can be invalidated.
574///
575/// This requires a very exact pattern of use that is only really suitable to
576/// range based for loops and other range algorithms that explicitly guarantee
577/// to dereference exactly once each element, and to increment exactly once each
578/// element.
579template <typename RangeT>
580iterator_range<early_inc_iterator_impl<detail::IterOfRange<RangeT>>>
581make_early_inc_range(RangeT &&Range) {
582 using EarlyIncIteratorT =
583 early_inc_iterator_impl<detail::IterOfRange<RangeT>>;
584 return make_range(EarlyIncIteratorT(std::begin(std::forward<RangeT>(Range))),
585 EarlyIncIteratorT(std::end(std::forward<RangeT>(Range))));
586}
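
A minimal usage sketch of make_early_inc_range(), illustrative only and not part of the annotated header; the map and helper name are made up:

  #include "llvm/ADT/STLExtras.h"
  #include <map>
  #include <string>

  static void eraseZeroCounts(std::map<std::string, int> &Counts) {
    // Erasing the current entry is safe: the wrapped iterator was advanced
    // past it when the element was dereferenced, and std::map::erase does
    // not invalidate iterators to other elements.
    for (auto &Entry : llvm::make_early_inc_range(Counts))
      if (Entry.second == 0)
        Counts.erase(Entry.first);
  }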
587
588// forward declarations required by zip_shortest/zip_first/zip_longest
589template <typename R, typename UnaryPredicate>
590bool all_of(R &&range, UnaryPredicate P);
591template <typename R, typename UnaryPredicate>
592bool any_of(R &&range, UnaryPredicate P);
593
594namespace detail {
595
596using std::declval;
597
598// We have to alias this since inlining the actual type at the usage site
599// in the parameter list of iterator_facade_base<> below ICEs MSVC 2017.
600template<typename... Iters> struct ZipTupleType {
601 using type = std::tuple<decltype(*declval<Iters>())...>;
602};
603
604template <typename ZipType, typename... Iters>
605using zip_traits = iterator_facade_base<
606 ZipType, typename std::common_type<std::bidirectional_iterator_tag,
607 typename std::iterator_traits<
608 Iters>::iterator_category...>::type,
609 // ^ TODO: Implement random access methods.
610 typename ZipTupleType<Iters...>::type,
611 typename std::iterator_traits<typename std::tuple_element<
612 0, std::tuple<Iters...>>::type>::difference_type,
613 // ^ FIXME: This follows boost::make_zip_iterator's assumption that all
614 // inner iterators have the same difference_type. It would fail if, for
615 // instance, the second field's difference_type were non-numeric while the
616 // first is.
617 typename ZipTupleType<Iters...>::type *,
618 typename ZipTupleType<Iters...>::type>;
619
620template <typename ZipType, typename... Iters>
621struct zip_common : public zip_traits<ZipType, Iters...> {
622 using Base = zip_traits<ZipType, Iters...>;
623 using value_type = typename Base::value_type;
624
625 std::tuple<Iters...> iterators;
626
627protected:
628 template <size_t... Ns> value_type deref(std::index_sequence<Ns...>) const {
629 return value_type(*std::get<Ns>(iterators)...);
630 }
631
632 template <size_t... Ns>
633 decltype(iterators) tup_inc(std::index_sequence<Ns...>) const {
634 return std::tuple<Iters...>(std::next(std::get<Ns>(iterators))...);
635 }
636
637 template <size_t... Ns>
638 decltype(iterators) tup_dec(std::index_sequence<Ns...>) const {
639 return std::tuple<Iters...>(std::prev(std::get<Ns>(iterators))...);
640 }
641
642public:
643 zip_common(Iters &&... ts) : iterators(std::forward<Iters>(ts)...) {}
644
645 value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
646
647 const value_type operator*() const {
648 return deref(std::index_sequence_for<Iters...>{});
649 }
650
651 ZipType &operator++() {
652 iterators = tup_inc(std::index_sequence_for<Iters...>{});
653 return *reinterpret_cast<ZipType *>(this);
654 }
655
656 ZipType &operator--() {
657 static_assert(Base::IsBidirectional,
658 "All inner iterators must be at least bidirectional.");
659 iterators = tup_dec(std::index_sequence_for<Iters...>{});
660 return *reinterpret_cast<ZipType *>(this);
661 }
662};
663
664template <typename... Iters>
665struct zip_first : public zip_common<zip_first<Iters...>, Iters...> {
666 using Base = zip_common<zip_first<Iters...>, Iters...>;
667
668 bool operator==(const zip_first<Iters...> &other) const {
669 return std::get<0>(this->iterators) == std::get<0>(other.iterators);
670 }
671
672 zip_first(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
673};
674
675template <typename... Iters>
676class zip_shortest : public zip_common<zip_shortest<Iters...>, Iters...> {
677 template <size_t... Ns>
678 bool test(const zip_shortest<Iters...> &other,
679 std::index_sequence<Ns...>) const {
680 return all_of(std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
681 std::get<Ns>(other.iterators)...},
682 identity<bool>{});
683 }
684
685public:
686 using Base = zip_common<zip_shortest<Iters...>, Iters...>;
687
688 zip_shortest(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
689
690 bool operator==(const zip_shortest<Iters...> &other) const {
691 return !test(other, std::index_sequence_for<Iters...>{});
692 }
693};
694
695template <template <typename...> class ItType, typename... Args> class zippy {
696public:
697 using iterator = ItType<decltype(std::begin(std::declval<Args>()))...>;
698 using iterator_category = typename iterator::iterator_category;
699 using value_type = typename iterator::value_type;
700 using difference_type = typename iterator::difference_type;
701 using pointer = typename iterator::pointer;
702 using reference = typename iterator::reference;
703
704private:
705 std::tuple<Args...> ts;
706
707 template <size_t... Ns>
708 iterator begin_impl(std::index_sequence<Ns...>) const {
709 return iterator(std::begin(std::get<Ns>(ts))...);
710 }
711 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) const {
712 return iterator(std::end(std::get<Ns>(ts))...);
713 }
714
715public:
716 zippy(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
717
718 iterator begin() const {
719 return begin_impl(std::index_sequence_for<Args...>{});
720 }
721 iterator end() const { return end_impl(std::index_sequence_for<Args...>{}); }
722};
723
724} // end namespace detail
725
726/// zip iterator for two or more iterable types.
727template <typename T, typename U, typename... Args>
728detail::zippy<detail::zip_shortest, T, U, Args...> zip(T &&t, U &&u,
729 Args &&... args) {
730 return detail::zippy<detail::zip_shortest, T, U, Args...>(
731 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
732}
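
A minimal usage sketch of zip(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment:

  #include "llvm/ADT/STLExtras.h"
  #include <string>
  #include <tuple>
  #include <vector>

  static int zipSketch() {
    std::vector<std::string> Names = {"a", "b", "c"};
    std::vector<int> Counts = {1, 2};  // shorter range: zip stops after two steps
    int Total = 0;
    for (auto Entry : llvm::zip(Names, Counts)) {
      // Each element is a tuple of references into the zipped ranges.
      Total += std::get<1>(Entry);
    }
    return Total; // 3
  }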
733
734/// zip iterator that, for the sake of efficiency, assumes the first iteratee to
735/// be the shortest.
736template <typename T, typename U, typename... Args>
737detail::zippy<detail::zip_first, T, U, Args...> zip_first(T &&t, U &&u,
738 Args &&... args) {
739 return detail::zippy<detail::zip_first, T, U, Args...>(
740 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
741}
742
743namespace detail {
744template <typename Iter>
745Iter next_or_end(const Iter &I, const Iter &End) {
746 if (I == End)
747 return End;
748 return std::next(I);
749}
750
751template <typename Iter>
752auto deref_or_none(const Iter &I, const Iter &End) -> llvm::Optional<
753 std::remove_const_t<std::remove_reference_t<decltype(*I)>>> {
754 if (I == End)
755 return None;
756 return *I;
757}
758
759template <typename Iter> struct ZipLongestItemType {
760 using type =
761 llvm::Optional<typename std::remove_const<typename std::remove_reference<
762 decltype(*std::declval<Iter>())>::type>::type>;
763};
764
765template <typename... Iters> struct ZipLongestTupleType {
766 using type = std::tuple<typename ZipLongestItemType<Iters>::type...>;
767};
768
769template <typename... Iters>
770class zip_longest_iterator
771 : public iterator_facade_base<
772 zip_longest_iterator<Iters...>,
773 typename std::common_type<
774 std::forward_iterator_tag,
775 typename std::iterator_traits<Iters>::iterator_category...>::type,
776 typename ZipLongestTupleType<Iters...>::type,
777 typename std::iterator_traits<typename std::tuple_element<
778 0, std::tuple<Iters...>>::type>::difference_type,
779 typename ZipLongestTupleType<Iters...>::type *,
780 typename ZipLongestTupleType<Iters...>::type> {
781public:
782 using value_type = typename ZipLongestTupleType<Iters...>::type;
783
784private:
785 std::tuple<Iters...> iterators;
786 std::tuple<Iters...> end_iterators;
787
788 template <size_t... Ns>
789 bool test(const zip_longest_iterator<Iters...> &other,
790 std::index_sequence<Ns...>) const {
791 return llvm::any_of(
792 std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
793 std::get<Ns>(other.iterators)...},
794 identity<bool>{});
795 }
796
797 template <size_t... Ns> value_type deref(std::index_sequence<Ns...>) const {
798 return value_type(
799 deref_or_none(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
800 }
801
802 template <size_t... Ns>
803 decltype(iterators) tup_inc(std::index_sequence<Ns...>) const {
804 return std::tuple<Iters...>(
805 next_or_end(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
806 }
807
808public:
809 zip_longest_iterator(std::pair<Iters &&, Iters &&>... ts)
810 : iterators(std::forward<Iters>(ts.first)...),
811 end_iterators(std::forward<Iters>(ts.second)...) {}
812
813 value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
814
815 value_type operator*() const {
816 return deref(std::index_sequence_for<Iters...>{});
817 }
818
819 zip_longest_iterator<Iters...> &operator++() {
820 iterators = tup_inc(std::index_sequence_for<Iters...>{});
821 return *this;
822 }
823
824 bool operator==(const zip_longest_iterator<Iters...> &other) const {
825 return !test(other, std::index_sequence_for<Iters...>{});
826 }
827};
828
829template <typename... Args> class zip_longest_range {
830public:
831 using iterator =
832 zip_longest_iterator<decltype(adl_begin(std::declval<Args>()))...>;
833 using iterator_category = typename iterator::iterator_category;
834 using value_type = typename iterator::value_type;
835 using difference_type = typename iterator::difference_type;
836 using pointer = typename iterator::pointer;
837 using reference = typename iterator::reference;
838
839private:
840 std::tuple<Args...> ts;
841
842 template <size_t... Ns>
843 iterator begin_impl(std::index_sequence<Ns...>) const {
844 return iterator(std::make_pair(adl_begin(std::get<Ns>(ts)),
845 adl_end(std::get<Ns>(ts)))...);
846 }
847
848 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) const {
849 return iterator(std::make_pair(adl_end(std::get<Ns>(ts)),
850 adl_end(std::get<Ns>(ts)))...);
851 }
852
853public:
854 zip_longest_range(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
855
856 iterator begin() const {
857 return begin_impl(std::index_sequence_for<Args...>{});
858 }
859 iterator end() const { return end_impl(std::index_sequence_for<Args...>{}); }
860};
861} // namespace detail
862
863/// Iterate over two or more iterators at the same time. Iteration continues
864/// until all iterators reach the end. The llvm::Optional only contains a value
865/// if the iterator has not reached the end.
866template <typename T, typename U, typename... Args>
867detail::zip_longest_range<T, U, Args...> zip_longest(T &&t, U &&u,
868 Args &&... args) {
869 return detail::zip_longest_range<T, U, Args...>(
870 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
871}
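
A minimal usage sketch of zip_longest(), illustrative only and not part of the annotated header; it assumes an LLVM 12 build environment:

  #include "llvm/ADT/STLExtras.h"
  #include <tuple>
  #include <vector>

  static int zipLongestSketch() {
    std::vector<int> A = {1, 2, 3};
    std::vector<int> B = {10, 20};
    int MissingFromB = 0;
    for (auto Entry : llvm::zip_longest(A, B)) {
      // Once B is exhausted, its slot holds an empty llvm::Optional<int>.
      if (!std::get<1>(Entry))
        ++MissingFromB;
    }
    return MissingFromB; // 1
  }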
872
873/// Iterator wrapper that concatenates sequences together.
874///
875/// This can concatenate different iterators, even with different types, into
876/// a single iterator provided the value types of all the concatenated
877/// iterators expose `reference` and `pointer` types that can be converted to
878/// `ValueT &` and `ValueT *` respectively. It doesn't support more
879/// interesting/customized pointer or reference types.
880///
881/// Currently this only supports forward or higher iterator categories as
882/// inputs and always exposes a forward iterator interface.
883template <typename ValueT, typename... IterTs>
884class concat_iterator
885 : public iterator_facade_base<concat_iterator<ValueT, IterTs...>,
886 std::forward_iterator_tag, ValueT> {
887 using BaseT = typename concat_iterator::iterator_facade_base;
888
889 /// We store both the current and end iterators for each concatenated
890 /// sequence in a tuple of pairs.
891 ///
892 /// Note that something like iterator_range seems nice at first here, but the
893 /// range properties are of little benefit and end up getting in the way
894 /// because we need to do mutation on the current iterators.
895 std::tuple<IterTs...> Begins;
896 std::tuple<IterTs...> Ends;
897
898 /// Attempts to increment a specific iterator.
899 ///
900 /// Returns true if it was able to increment the iterator. Returns false if
901 /// the iterator is already at the end iterator.
902 template <size_t Index> bool incrementHelper() {
903 auto &Begin = std::get<Index>(Begins);
904 auto &End = std::get<Index>(Ends);
905 if (Begin == End)
906 return false;
907
908 ++Begin;
909 return true;
910 }
911
912 /// Increments the first non-end iterator.
913 ///
914 /// It is an error to call this with all iterators at the end.
915 template <size_t... Ns> void increment(std::index_sequence<Ns...>) {
916 // Build a sequence of functions to increment each iterator if possible.
917 bool (concat_iterator::*IncrementHelperFns[])() = {
918 &concat_iterator::incrementHelper<Ns>...};
919
920 // Loop over them, and stop as soon as we succeed at incrementing one.
921 for (auto &IncrementHelperFn : IncrementHelperFns)
922 if ((this->*IncrementHelperFn)())
923 return;
924
925 llvm_unreachable("Attempted to increment an end concat iterator!")::llvm::llvm_unreachable_internal("Attempted to increment an end concat iterator!"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/ADT/STLExtras.h"
, 925)
;
926 }
927
928 /// Returns null if the specified iterator is at the end. Otherwise,
929 /// dereferences the iterator and returns the address of the resulting
930 /// reference.
931 template <size_t Index> ValueT *getHelper() const {
932 auto &Begin = std::get<Index>(Begins);
933 auto &End = std::get<Index>(Ends);
934 if (Begin == End)
935 return nullptr;
936
937 return &*Begin;
938 }
939
940 /// Finds the first non-end iterator, dereferences, and returns the resulting
941 /// reference.
942 ///
943 /// It is an error to call this with all iterators at the end.
944 template <size_t... Ns> ValueT &get(std::index_sequence<Ns...>) const {
945 // Build a sequence of functions to get from iterator if possible.
946 ValueT *(concat_iterator::*GetHelperFns[])() const = {
947 &concat_iterator::getHelper<Ns>...};
948
949 // Loop over them, and return the first result we find.
950 for (auto &GetHelperFn : GetHelperFns)
951 if (ValueT *P = (this->*GetHelperFn)())
952 return *P;
953
954 llvm_unreachable("Attempted to get a pointer from an end concat iterator!")::llvm::llvm_unreachable_internal("Attempted to get a pointer from an end concat iterator!"
, "/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/ADT/STLExtras.h"
, 954)
;
955 }
956
957public:
958 /// Constructs an iterator from a sequence of ranges.
959 ///
960 /// We need the full range to know how to switch between each of the
961 /// iterators.
962 template <typename... RangeTs>
963 explicit concat_iterator(RangeTs &&... Ranges)
964 : Begins(std::begin(Ranges)...), Ends(std::end(Ranges)...) {}
965
966 using BaseT::operator++;
967
968 concat_iterator &operator++() {
969 increment(std::index_sequence_for<IterTs...>());
970 return *this;
971 }
972
973 ValueT &operator*() const {
974 return get(std::index_sequence_for<IterTs...>());
975 }
976
977 bool operator==(const concat_iterator &RHS) const {
978 return Begins == RHS.Begins && Ends == RHS.Ends;
979 }
980};
981
982namespace detail {
983
984/// Helper to store a sequence of ranges being concatenated and access them.
985///
986/// This is designed to facilitate providing actual storage when temporaries
987/// are passed into the constructor such that we can use it as part of range
988/// based for loops.
989template <typename ValueT, typename... RangeTs> class concat_range {
990public:
991 using iterator =
992 concat_iterator<ValueT,
993 decltype(std::begin(std::declval<RangeTs &>()))...>;
994
995private:
996 std::tuple<RangeTs...> Ranges;
997
998 template <size_t... Ns> iterator begin_impl(std::index_sequence<Ns...>) {
999 return iterator(std::get<Ns>(Ranges)...);
1000 }
1001 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) {
1002 return iterator(make_range(std::end(std::get<Ns>(Ranges)),
1003 std::end(std::get<Ns>(Ranges)))...);
1004 }
1005
1006public:
1007 concat_range(RangeTs &&... Ranges)
1008 : Ranges(std::forward<RangeTs>(Ranges)...) {}
1009
1010 iterator begin() { return begin_impl(std::index_sequence_for<RangeTs...>{}); }
1011 iterator end() { return end_impl(std::index_sequence_for<RangeTs...>{}); }
1012};
1013
1014} // end namespace detail
1015
1016/// Concatenated range across two or more ranges.
1017///
1018/// The desired value type must be explicitly specified.
1019template <typename ValueT, typename... RangeTs>
1020detail::concat_range<ValueT, RangeTs...> concat(RangeTs &&... Ranges) {
1021 static_assert(sizeof...(RangeTs) > 1,
1022 "Need more than one range to concatenate!");
1023 return detail::concat_range<ValueT, RangeTs...>(
1024 std::forward<RangeTs>(Ranges)...);
1025}
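/// A minimal usage sketch for concat (illustrative; V1 and V2 are
/// hypothetical containers). The value type must be spelled out explicitly:
/// \code
///   std::vector<int> V1 = {1, 2};
///   std::vector<int> V2 = {3};
///   for (int &I : concat<int>(V1, V2))
///     I *= 2; // visits 1, 2, 3 in order across both containers
/// \endcode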
1026
1027/// A utility class used to implement an iterator that contains some base object
1028/// and an index. The iterator moves the index but keeps the base constant.
1029template <typename DerivedT, typename BaseT, typename T,
1030 typename PointerT = T *, typename ReferenceT = T &>
1031class indexed_accessor_iterator
1032 : public llvm::iterator_facade_base<DerivedT,
1033 std::random_access_iterator_tag, T,
1034 std::ptrdiff_t, PointerT, ReferenceT> {
1035public:
1036 ptrdiff_t operator-(const indexed_accessor_iterator &rhs) const {
1037 assert(base == rhs.base && "incompatible iterators");
1038 return index - rhs.index;
1039 }
1040 bool operator==(const indexed_accessor_iterator &rhs) const {
1041 return base == rhs.base && index == rhs.index;
1042 }
1043 bool operator<(const indexed_accessor_iterator &rhs) const {
1044 assert(base == rhs.base && "incompatible iterators");
1045 return index < rhs.index;
1046 }
1047
1048 DerivedT &operator+=(ptrdiff_t offset) {
1049 this->index += offset;
1050 return static_cast<DerivedT &>(*this);
1051 }
1052 DerivedT &operator-=(ptrdiff_t offset) {
1053 this->index -= offset;
1054 return static_cast<DerivedT &>(*this);
1055 }
1056
1057 /// Returns the current index of the iterator.
1058 ptrdiff_t getIndex() const { return index; }
1059
1060 /// Returns the current base of the iterator.
1061 const BaseT &getBase() const { return base; }
1062
1063protected:
1064 indexed_accessor_iterator(BaseT base, ptrdiff_t index)
1065 : base(base), index(index) {}
1066 BaseT base;
1067 ptrdiff_t index;
1068};
1069
1070namespace detail {
1071/// The class represents the base of a range of indexed_accessor_iterators. It
1072/// provides support for many different range functionalities, e.g.
1073/// drop_front/slice/etc.. Derived range classes must implement the following
1074/// static methods:
1075/// * ReferenceT dereference_iterator(const BaseT &base, ptrdiff_t index)
1076/// - Dereference an iterator pointing to the base object at the given
1077/// index.
1078/// * BaseT offset_base(const BaseT &base, ptrdiff_t index)
1079/// - Return a new base that is offset from the provided base by 'index'
1080/// elements.
1081template <typename DerivedT, typename BaseT, typename T,
1082 typename PointerT = T *, typename ReferenceT = T &>
1083class indexed_accessor_range_base {
1084public:
1085 using RangeBaseT =
1086 indexed_accessor_range_base<DerivedT, BaseT, T, PointerT, ReferenceT>;
1087
1088 /// An iterator element of this range.
1089 class iterator : public indexed_accessor_iterator<iterator, BaseT, T,
1090 PointerT, ReferenceT> {
1091 public:
1092 // Index into this iterator, invoking a static method on the derived type.
1093 ReferenceT operator*() const {
1094 return DerivedT::dereference_iterator(this->getBase(), this->getIndex());
1095 }
1096
1097 private:
1098 iterator(BaseT owner, ptrdiff_t curIndex)
1099 : indexed_accessor_iterator<iterator, BaseT, T, PointerT, ReferenceT>(
1100 owner, curIndex) {}
1101
1102 /// Allow access to the constructor.
1103 friend indexed_accessor_range_base<DerivedT, BaseT, T, PointerT,
1104 ReferenceT>;
1105 };
1106
1107 indexed_accessor_range_base(iterator begin, iterator end)
1108 : base(offset_base(begin.getBase(), begin.getIndex())),
1109 count(end.getIndex() - begin.getIndex()) {}
1110 indexed_accessor_range_base(const iterator_range<iterator> &range)
1111 : indexed_accessor_range_base(range.begin(), range.end()) {}
1112 indexed_accessor_range_base(BaseT base, ptrdiff_t count)
1113 : base(base), count(count) {}
1114
1115 iterator begin() const { return iterator(base, 0); }
1116 iterator end() const { return iterator(base, count); }
1117 ReferenceT operator[](unsigned index) const {
1118 assert(index < size() && "invalid index for value range");
1119 return DerivedT::dereference_iterator(base, index);
1120 }
1121 ReferenceT front() const {
1122 assert(!empty() && "expected non-empty range");
1123 return (*this)[0];
1124 }
1125 ReferenceT back() const {
1126 assert(!empty() && "expected non-empty range");
1127 return (*this)[size() - 1];
1128 }
1129
1130 /// Compare this range with another.
1131 template <typename OtherT> bool operator==(const OtherT &other) const {
1132 return size() ==
1133 static_cast<size_t>(std::distance(other.begin(), other.end())) &&
1134 std::equal(begin(), end(), other.begin());
1135 }
1136 template <typename OtherT> bool operator!=(const OtherT &other) const {
1137 return !(*this == other);
1138 }
1139
1140 /// Return the size of this range.
1141 size_t size() const { return count; }
1142
1143 /// Return if the range is empty.
1144 bool empty() const { return size() == 0; }
1145
1146 /// Drop the first N elements, and keep M elements.
1147 DerivedT slice(size_t n, size_t m) const {
1148 assert(n + m <= size() && "invalid size specifiers");
1149 return DerivedT(offset_base(base, n), m);
1150 }
1151
1152 /// Drop the first n elements.
1153 DerivedT drop_front(size_t n = 1) const {
1154 assert(size() >= n && "Dropping more elements than exist");
1155 return slice(n, size() - n);
1156 }
1157 /// Drop the last n elements.
1158 DerivedT drop_back(size_t n = 1) const {
1159 assert(size() >= n && "Dropping more elements than exist");
1160 return DerivedT(base, size() - n);
1161 }
1162
1163 /// Take the first n elements.
1164 DerivedT take_front(size_t n = 1) const {
1165 return n < size() ? drop_back(size() - n)
1166 : static_cast<const DerivedT &>(*this);
1167 }
1168
1169 /// Take the last n elements.
1170 DerivedT take_back(size_t n = 1) const {
1171 return n < size() ? drop_front(size() - n)
1172 : static_cast<const DerivedT &>(*this);
1173 }
1174
1175 /// Allow conversion to any type accepting an iterator_range.
1176 template <typename RangeT, typename = std::enable_if_t<std::is_constructible<
1177 RangeT, iterator_range<iterator>>::value>>
1178 operator RangeT() const {
1179 return RangeT(iterator_range<iterator>(*this));
1180 }
1181
1182 /// Returns the base of this range.
1183 const BaseT &getBase() const { return base; }
1184
1185private:
1186 /// Offset the given base by the given amount.
1187 static BaseT offset_base(const BaseT &base, size_t n) {
1188 return n == 0 ? base : DerivedT::offset_base(base, n);
1189 }
1190
1191protected:
1192 indexed_accessor_range_base(const indexed_accessor_range_base &) = default;
1193 indexed_accessor_range_base(indexed_accessor_range_base &&) = default;
1194 indexed_accessor_range_base &
1195 operator=(const indexed_accessor_range_base &) = default;
1196
1197 /// The base that owns the provided range of values.
1198 BaseT base;
1199 /// The size from the owning range.
1200 ptrdiff_t count;
1201};
1202} // end namespace detail
1203
1204/// This class provides an implementation of a range of
1205/// indexed_accessor_iterators where the base is not indexable. Ranges with
1206/// bases that are offsetable should derive from indexed_accessor_range_base
1207/// instead. Derived range classes are expected to implement the following
1208/// static method:
1209/// * ReferenceT dereference(const BaseT &base, ptrdiff_t index)
1210/// - Dereference an iterator pointing to a parent base at the given index.
1211template <typename DerivedT, typename BaseT, typename T,
1212 typename PointerT = T *, typename ReferenceT = T &>
1213class indexed_accessor_range
1214 : public detail::indexed_accessor_range_base<
1215 DerivedT, std::pair<BaseT, ptrdiff_t>, T, PointerT, ReferenceT> {
1216public:
1217 indexed_accessor_range(BaseT base, ptrdiff_t startIndex, ptrdiff_t count)
1218 : detail::indexed_accessor_range_base<
1219 DerivedT, std::pair<BaseT, ptrdiff_t>, T, PointerT, ReferenceT>(
1220 std::make_pair(base, startIndex), count) {}
1221 using detail::indexed_accessor_range_base<
1222 DerivedT, std::pair<BaseT, ptrdiff_t>, T, PointerT,
1223 ReferenceT>::indexed_accessor_range_base;
1224
1225 /// Returns the current base of the range.
1226 const BaseT &getBase() const { return this->base.first; }
1227
1228 /// Returns the current start index of the range.
1229 ptrdiff_t getStartIndex() const { return this->base.second; }
1230
1231 /// See `detail::indexed_accessor_range_base` for details.
1232 static std::pair<BaseT, ptrdiff_t>
1233 offset_base(const std::pair<BaseT, ptrdiff_t> &base, ptrdiff_t index) {
1234 // We encode the internal base as a pair of the derived base and a start
1235 // index into the derived base.
1236 return std::make_pair(base.first, base.second + index);
1237 }
1238 /// See `detail::indexed_accessor_range_base` for details.
1239 static ReferenceT
1240 dereference_iterator(const std::pair<BaseT, ptrdiff_t> &base,
1241 ptrdiff_t index) {
1242 return DerivedT::dereference(base.first, base.second + index);
1243 }
1244};
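/// An illustrative sketch (hypothetical names) of a derived range that
/// satisfies the static interface described above, exposing a raw buffer:
/// \code
///   struct IntArrayRange
///       : indexed_accessor_range<IntArrayRange, int *, int> {
///     using indexed_accessor_range<IntArrayRange, int *,
///                                  int>::indexed_accessor_range;
///     // Dereference the element at 'index' within the parent base.
///     static int &dereference(int *base, ptrdiff_t index) {
///       return base[index];
///     }
///   };
///
///   int Buffer[3] = {10, 20, 30};
///   IntArrayRange R(Buffer, /*startIndex=*/0, /*count=*/3);
///   int First = R.front(); // 10
/// \endcode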
1245
1246/// Given a container of pairs, return a range over the first elements.
1247template <typename ContainerTy> auto make_first_range(ContainerTy &&c) {
1248 return llvm::map_range(
1249 std::forward<ContainerTy>(c),
1250 [](decltype((*std::begin(c))) elt) -> decltype((elt.first)) {
1251 return elt.first;
1252 });
1253}
1254
1255/// Given a container of pairs, return a range over the second elements.
1256template <typename ContainerTy> auto make_second_range(ContainerTy &&c) {
1257 return llvm::map_range(
1258 std::forward<ContainerTy>(c),
1259 [](decltype((*std::begin(c))) elt) -> decltype((elt.second)) {
1260 return elt.second;
1261 });
1262}
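/// Usage sketch for the two helpers above (illustrative; 'M' is a
/// hypothetical container of pairs):
/// \code
///   std::vector<std::pair<int, StringRef>> M = {{1, "one"}, {2, "two"}};
///   for (int Key : make_first_range(M)) { /* 1, 2 */ }
///   for (StringRef Val : make_second_range(M)) { /* "one", "two" */ }
/// \endcode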
1263
1264//===----------------------------------------------------------------------===//
1265// Extra additions to <utility>
1266//===----------------------------------------------------------------------===//
1267
1268/// Function object to check whether the first component of a std::pair
1269/// compares less than the first component of another std::pair.
1270struct less_first {
1271 template <typename T> bool operator()(const T &lhs, const T &rhs) const {
1272 return lhs.first < rhs.first;
1273 }
1274};
1275
1276/// Function object to check whether the second component of a std::pair
1277/// compares less than the second component of another std::pair.
1278struct less_second {
1279 template <typename T> bool operator()(const T &lhs, const T &rhs) const {
1280 return lhs.second < rhs.second;
1281 }
1282};
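/// Usage sketch (illustrative): both functors are intended as comparators
/// for sort-style algorithms over containers of pairs.
/// \code
///   std::vector<std::pair<int, int>> V = {{2, 20}, {1, 10}};
///   std::sort(V.begin(), V.end(), less_first());  // orders by .first
///   std::sort(V.begin(), V.end(), less_second()); // orders by .second
/// \endcode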
1283
1284/// \brief Function object to apply a binary function to the first component of
1285/// a std::pair.
1286template<typename FuncTy>
1287struct on_first {
1288 FuncTy func;
1289
1290 template <typename T>
1291 decltype(auto) operator()(const T &lhs, const T &rhs) const {
1292 return func(lhs.first, rhs.first);
1293 }
1294};
1295
1296/// Utility type to build an inheritance chain that makes it easy to rank
1297/// overload candidates.
1298template <int N> struct rank : rank<N - 1> {};
1299template <> struct rank<0> {};
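/// A minimal sketch of the overload-ranking idiom (the helpers below are
/// hypothetical): the call site passes the highest rank, and overload
/// resolution prefers the most-derived rank it can still convert to.
/// \code
///   template <typename T>
///   auto printImpl(T &Obj, rank<1>) -> decltype(Obj.print(), void()) {
///     Obj.print();                 // preferred when T has a print() member
///   }
///   template <typename T> void printImpl(T &Obj, rank<0>) {
///     // generic fallback
///   }
///   template <typename T> void printIt(T &Obj) { printImpl(Obj, rank<1>()); }
/// \endcode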
1300
1301/// traits class for checking whether type T is one of any of the given
1302/// types in the variadic list.
1303template <typename T, typename... Ts> struct is_one_of {
1304 static const bool value = false;
1305};
1306
1307template <typename T, typename U, typename... Ts>
1308struct is_one_of<T, U, Ts...> {
1309 static const bool value =
1310 std::is_same<T, U>::value || is_one_of<T, Ts...>::value;
1311};
1312
1313/// traits class for checking whether type T is a base class for all
1314/// the given types in the variadic list.
1315template <typename T, typename... Ts> struct are_base_of {
1316 static const bool value = true;
1317};
1318
1319template <typename T, typename U, typename... Ts>
1320struct are_base_of<T, U, Ts...> {
1321 static const bool value =
1322 std::is_base_of<T, U>::value && are_base_of<T, Ts...>::value;
1323};
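/// Usage sketch for the two traits above (illustrative):
/// \code
///   static_assert(is_one_of<int, float, int, char>::value, "int is listed");
///   static_assert(are_base_of<std::exception, std::runtime_error,
///                             std::logic_error>::value,
///                 "both derive from std::exception");
/// \endcode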
1324
1325//===----------------------------------------------------------------------===//
1326// Extra additions for arrays
1327//===----------------------------------------------------------------------===//
1328
1329// We have a copy here so that LLVM behaves the same when using different
1330// standard libraries.
1331template <class Iterator, class RNG>
1332void shuffle(Iterator first, Iterator last, RNG &&g) {
1333 // It would be better to use a std::uniform_int_distribution,
1334 // but that would be stdlib dependent.
1335 for (auto size = last - first; size > 1; ++first, (void)--size)
1336 std::iter_swap(first, first + g() % size);
1337}
1338
1339/// Find the length of an array.
1340template <class T, std::size_t N>
1341constexpr inline size_t array_lengthof(T (&)[N]) {
1342 return N;
1343}
1344
1345/// Adapt std::less<T> for array_pod_sort.
1346template<typename T>
1347inline int array_pod_sort_comparator(const void *P1, const void *P2) {
1348 if (std::less<T>()(*reinterpret_cast<const T*>(P1),
1349 *reinterpret_cast<const T*>(P2)))
1350 return -1;
1351 if (std::less<T>()(*reinterpret_cast<const T*>(P2),
1352 *reinterpret_cast<const T*>(P1)))
1353 return 1;
1354 return 0;
1355}
1356
1357/// get_array_pod_sort_comparator - This is an internal helper function used to
1358/// get type deduction of T right.
1359template<typename T>
1360inline int (*get_array_pod_sort_comparator(const T &))
1361 (const void*, const void*) {
1362 return array_pod_sort_comparator<T>;
1363}
1364
1365#ifdef EXPENSIVE_CHECKS
1366namespace detail {
1367
1368inline unsigned presortShuffleEntropy() {
1369 static unsigned Result(std::random_device{}());
1370 return Result;
1371}
1372
1373template <class IteratorTy>
1374inline void presortShuffle(IteratorTy Start, IteratorTy End) {
1375 std::mt19937 Generator(presortShuffleEntropy());
1376 std::shuffle(Start, End, Generator);
1377}
1378
1379} // end namespace detail
1380#endif
1381
1382/// array_pod_sort - This sorts an array with the specified start and end
1383/// extent. This is just like std::sort, except that it calls qsort instead of
1384/// using an inlined template. qsort is slightly slower than std::sort, but
1385/// most sorts are not performance critical in LLVM and std::sort has to be
1386/// template instantiated for each type, leading to significant measured code
1387/// bloat. This function should generally be used instead of std::sort where
1388/// possible.
1389///
1390/// This function assumes that you have simple POD-like types that can be
1391/// compared with std::less and can be moved with memcpy. If this isn't true,
1392/// you should use std::sort.
1393///
1394/// NOTE: If qsort_r were portable, we could allow a custom comparator and
1395/// default to std::less.
1396template<class IteratorTy>
1397inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
1398 // Don't inefficiently call qsort with one element or trigger undefined
1399 // behavior with an empty sequence.
1400 auto NElts = End - Start;
1401 if (NElts <= 1) return;
1402#ifdef EXPENSIVE_CHECKS
1403 detail::presortShuffle<IteratorTy>(Start, End);
1404#endif
1405 qsort(&*Start, NElts, sizeof(*Start), get_array_pod_sort_comparator(*Start));
1406}
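/// Usage sketch (illustrative): array_pod_sort is a drop-in for std::sort on
/// simple value types that are safe to reorder with memcpy.
/// \code
///   unsigned Offsets[] = {42, 7, 19};
///   array_pod_sort(std::begin(Offsets), std::end(Offsets)); // 7, 19, 42
/// \endcode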
1407
1408template <class IteratorTy>
1409inline void array_pod_sort(
1410 IteratorTy Start, IteratorTy End,
1411 int (*Compare)(
1412 const typename std::iterator_traits<IteratorTy>::value_type *,
1413 const typename std::iterator_traits<IteratorTy>::value_type *)) {
1414 // Don't inefficiently call qsort with one element or trigger undefined
1415 // behavior with an empty sequence.
1416 auto NElts = End - Start;
1417 if (NElts <= 1) return;
1418#ifdef EXPENSIVE_CHECKS
1419 detail::presortShuffle<IteratorTy>(Start, End);
1420#endif
1421 qsort(&*Start, NElts, sizeof(*Start),
1422 reinterpret_cast<int (*)(const void *, const void *)>(Compare));
1423}
1424
1425namespace detail {
1426template <typename T>
1427// We can use qsort if the iterator type is a pointer and the underlying value
1428// is trivially copyable.
1429using sort_trivially_copyable = conjunction<
1430 std::is_pointer<T>,
1431 std::is_trivially_copyable<typename std::iterator_traits<T>::value_type>>;
1432} // namespace detail
1433
1434// Provide wrappers to std::sort which shuffle the elements before sorting
1435// to help uncover non-deterministic behavior (PR35135).
1436template <typename IteratorTy,
1437 std::enable_if_t<!detail::sort_trivially_copyable<IteratorTy>::value,
1438 int> = 0>
1439inline void sort(IteratorTy Start, IteratorTy End) {
1440#ifdef EXPENSIVE_CHECKS
1441 detail::presortShuffle<IteratorTy>(Start, End);
1442#endif
1443 std::sort(Start, End);
1444}
1445
1446// Forward trivially copyable types to array_pod_sort. This avoids a large
1447// amount of code bloat for a minor performance hit.
1448template <typename IteratorTy,
1449 std::enable_if_t<detail::sort_trivially_copyable<IteratorTy>::value,
1450 int> = 0>
1451inline void sort(IteratorTy Start, IteratorTy End) {
1452 array_pod_sort(Start, End);
1453}
1454
1455template <typename Container> inline void sort(Container &&C) {
1456 llvm::sort(adl_begin(C), adl_end(C));
1457}
1458
1459template <typename IteratorTy, typename Compare>
1460inline void sort(IteratorTy Start, IteratorTy End, Compare Comp) {
1461#ifdef EXPENSIVE_CHECKS
1462 detail::presortShuffle<IteratorTy>(Start, End);
1463#endif
1464 std::sort(Start, End, Comp);
1465}
1466
1467template <typename Container, typename Compare>
1468inline void sort(Container &&C, Compare Comp) {
1469 llvm::sort(adl_begin(C), adl_end(C), Comp);
1470}
1471
1472//===----------------------------------------------------------------------===//
1473// Extra additions to <algorithm>
1474//===----------------------------------------------------------------------===//
1475
1476/// Get the size of a range. This is a wrapper function around std::distance
1477/// which is only enabled when the operation is O(1).
1478template <typename R>
1479auto size(R &&Range,
1480 std::enable_if_t<
1481 std::is_base_of<std::random_access_iterator_tag,
1482 typename std::iterator_traits<decltype(
1483 Range.begin())>::iterator_category>::value,
1484 void> * = nullptr) {
1485 return std::distance(Range.begin(), Range.end());
1486}
1487
1488/// Provide wrappers to std::for_each which take ranges instead of having to
1489/// pass begin/end explicitly.
1490template <typename R, typename UnaryFunction>
1491UnaryFunction for_each(R &&Range, UnaryFunction F) {
1492 return std::for_each(adl_begin(Range), adl_end(Range), F);
1493}
1494
1495/// Provide wrappers to std::all_of which take ranges instead of having to pass
1496/// begin/end explicitly.
1497template <typename R, typename UnaryPredicate>
1498bool all_of(R &&Range, UnaryPredicate P) {
1499 return std::all_of(adl_begin(Range), adl_end(Range), P);
1500}
1501
1502/// Provide wrappers to std::any_of which take ranges instead of having to pass
1503/// begin/end explicitly.
1504template <typename R, typename UnaryPredicate>
1505bool any_of(R &&Range, UnaryPredicate P) {
1506 return std::any_of(adl_begin(Range), adl_end(Range), P);
44. Calling 'any_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
49. Returning from 'any_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
50. Returning zero, which participates in a condition later
1507}
1508
1509/// Provide wrappers to std::none_of which take ranges instead of having to pass
1510/// begin/end explicitly.
1511template <typename R, typename UnaryPredicate>
1512bool none_of(R &&Range, UnaryPredicate P) {
1513 return std::none_of(adl_begin(Range), adl_end(Range), P);
1514}
1515
1516/// Provide wrappers to std::find which take ranges instead of having to pass
1517/// begin/end explicitly.
1518template <typename R, typename T> auto find(R &&Range, const T &Val) {
1519 return std::find(adl_begin(Range), adl_end(Range), Val);
1520}
1521
1522/// Provide wrappers to std::find_if which take ranges instead of having to pass
1523/// begin/end explicitly.
1524template <typename R, typename UnaryPredicate>
1525auto find_if(R &&Range, UnaryPredicate P) {
1526 return std::find_if(adl_begin(Range), adl_end(Range), P);
1527}
1528
1529template <typename R, typename UnaryPredicate>
1530auto find_if_not(R &&Range, UnaryPredicate P) {
1531 return std::find_if_not(adl_begin(Range), adl_end(Range), P);
1532}
1533
1534/// Provide wrappers to std::remove_if which take ranges instead of having to
1535/// pass begin/end explicitly.
1536template <typename R, typename UnaryPredicate>
1537auto remove_if(R &&Range, UnaryPredicate P) {
1538 return std::remove_if(adl_begin(Range), adl_end(Range), P);
1539}
1540
1541/// Provide wrappers to std::copy_if which take ranges instead of having to
1542/// pass begin/end explicitly.
1543template <typename R, typename OutputIt, typename UnaryPredicate>
1544OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P) {
1545 return std::copy_if(adl_begin(Range), adl_end(Range), Out, P);
1546}
1547
1548template <typename R, typename OutputIt>
1549OutputIt copy(R &&Range, OutputIt Out) {
1550 return std::copy(adl_begin(Range), adl_end(Range), Out);
1551}
1552
1553/// Provide wrappers to std::move which take ranges instead of having to
1554/// pass begin/end explicitly.
1555template <typename R, typename OutputIt>
1556OutputIt move(R &&Range, OutputIt Out) {
1557 return std::move(adl_begin(Range), adl_end(Range), Out);
1558}
1559
1560/// Wrapper function around std::find to detect if an element exists
1561/// in a container.
1562template <typename R, typename E>
1563bool is_contained(R &&Range, const E &Element) {
1564 return std::find(adl_begin(Range), adl_end(Range), Element) != adl_end(Range);
1565}
1566
1567/// Wrapper function around std::is_sorted to check if elements in a range \p R
1568/// are sorted with respect to a comparator \p C.
1569template <typename R, typename Compare> bool is_sorted(R &&Range, Compare C) {
1570 return std::is_sorted(adl_begin(Range), adl_end(Range), C);
1571}
1572
1573/// Wrapper function around std::is_sorted to check if elements in a range \p R
1574/// are sorted in non-descending order.
1575template <typename R> bool is_sorted(R &&Range) {
1576 return std::is_sorted(adl_begin(Range), adl_end(Range));
1577}
1578
1579/// Wrapper function around std::count to count the number of times an element
1580/// \p Element occurs in the given range \p Range.
1581template <typename R, typename E> auto count(R &&Range, const E &Element) {
1582 return std::count(adl_begin(Range), adl_end(Range), Element);
1583}
1584
1585/// Wrapper function around std::count_if to count the number of times an
1586/// element satisfying a given predicate occurs in a range.
1587template <typename R, typename UnaryPredicate>
1588auto count_if(R &&Range, UnaryPredicate P) {
1589 return std::count_if(adl_begin(Range), adl_end(Range), P);
1590}
1591
1592/// Wrapper function around std::transform to apply a function to a range and
1593/// store the result elsewhere.
1594template <typename R, typename OutputIt, typename UnaryFunction>
1595OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F) {
1596 return std::transform(adl_begin(Range), adl_end(Range), d_first, F);
1597}
1598
1599/// Provide wrappers to std::partition which take ranges instead of having to
1600/// pass begin/end explicitly.
1601template <typename R, typename UnaryPredicate>
1602auto partition(R &&Range, UnaryPredicate P) {
1603 return std::partition(adl_begin(Range), adl_end(Range), P);
1604}
1605
1606/// Provide wrappers to std::lower_bound which take ranges instead of having to
1607/// pass begin/end explicitly.
1608template <typename R, typename T> auto lower_bound(R &&Range, T &&Value) {
1609 return std::lower_bound(adl_begin(Range), adl_end(Range),
1610 std::forward<T>(Value));
1611}
1612
1613template <typename R, typename T, typename Compare>
1614auto lower_bound(R &&Range, T &&Value, Compare C) {
1615 return std::lower_bound(adl_begin(Range), adl_end(Range),
1616 std::forward<T>(Value), C);
1617}
1618
1619/// Provide wrappers to std::upper_bound which take ranges instead of having to
1620/// pass begin/end explicitly.
1621template <typename R, typename T> auto upper_bound(R &&Range, T &&Value) {
1622 return std::upper_bound(adl_begin(Range), adl_end(Range),
1623 std::forward<T>(Value));
1624}
1625
1626template <typename R, typename T, typename Compare>
1627auto upper_bound(R &&Range, T &&Value, Compare C) {
1628 return std::upper_bound(adl_begin(Range), adl_end(Range),
1629 std::forward<T>(Value), C);
1630}
1631
1632template <typename R>
1633void stable_sort(R &&Range) {
1634 std::stable_sort(adl_begin(Range), adl_end(Range));
1635}
1636
1637template <typename R, typename Compare>
1638void stable_sort(R &&Range, Compare C) {
1639 std::stable_sort(adl_begin(Range), adl_end(Range), C);
1640}
1641
1642/// Binary search for the first iterator in a range where a predicate is false.
1643/// Requires that C is always true below some limit, and always false above it.
1644template <typename R, typename Predicate,
1645 typename Val = decltype(*adl_begin(std::declval<R>()))>
1646auto partition_point(R &&Range, Predicate P) {
1647 return std::partition_point(adl_begin(Range), adl_end(Range), P);
1648}
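/// Usage sketch (illustrative): the predicate must partition the range, i.e.
/// hold for a prefix and fail for the rest.
/// \code
///   std::vector<int> Sorted = {1, 2, 4, 8, 16};
///   // First element that is not < 5:
///   auto It = partition_point(Sorted, [](int N) { return N < 5; }); // -> 8
/// \endcode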
1649
1650/// Wrapper function around std::equal to detect if all elements
1651/// in a container are the same.
1652template <typename R>
1653bool is_splat(R &&Range) {
1654 size_t range_size = size(Range);
1655 return range_size != 0 && (range_size == 1 ||
1656 std::equal(adl_begin(Range) + 1, adl_end(Range), adl_begin(Range)));
1657}
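/// Usage sketch (illustrative):
/// \code
///   std::vector<int> Mask = {7, 7, 7};
///   bool AllSame = is_splat(Mask); // true; an empty range is never a splat
/// \endcode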
1658
1659/// Provide a container algorithm similar to C++ Library Fundamentals v2's
1660/// `erase_if` which is equivalent to:
1661///
1662/// C.erase(remove_if(C, pred), C.end());
1663///
1664/// This version works for any container with an erase method call accepting
1665/// two iterators.
1666template <typename Container, typename UnaryPredicate>
1667void erase_if(Container &C, UnaryPredicate P) {
1668 C.erase(remove_if(C, P), C.end());
1669}
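/// Usage sketch (illustrative):
/// \code
///   std::vector<int> V = {1, 2, 3, 4};
///   erase_if(V, [](int N) { return N % 2 == 0; }); // V == {1, 3}
/// \endcode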
1670
1671/// Wrapper function to remove a value from a container:
1672///
1673/// C.erase(remove(C.begin(), C.end(), V), C.end());
1674template <typename Container, typename ValueType>
1675void erase_value(Container &C, ValueType V) {
1676 C.erase(std::remove(C.begin(), C.end(), V), C.end());
1677}
1678
1679/// Wrapper function to append a range to a container.
1680///
1681/// C.insert(C.end(), R.begin(), R.end());
1682template <typename Container, typename Range>
1683inline void append_range(Container &C, Range &&R) {
1684 C.insert(C.end(), R.begin(), R.end());
1685}
1686
1687/// Given a sequence container Cont, replace the range [ContIt, ContEnd) with
1688/// the range [ValIt, ValEnd) (which is not from the same container).
1689template<typename Container, typename RandomAccessIterator>
1690void replace(Container &Cont, typename Container::iterator ContIt,
1691 typename Container::iterator ContEnd, RandomAccessIterator ValIt,
1692 RandomAccessIterator ValEnd) {
1693 while (true) {
1694 if (ValIt == ValEnd) {
1695 Cont.erase(ContIt, ContEnd);
1696 return;
1697 } else if (ContIt == ContEnd) {
1698 Cont.insert(ContIt, ValIt, ValEnd);
1699 return;
1700 }
1701 *ContIt++ = *ValIt++;
1702 }
1703}
1704
1705/// Given a sequence container Cont, replace the range [ContIt, ContEnd) with
1706/// the range R.
1707template<typename Container, typename Range = std::initializer_list<
1708 typename Container::value_type>>
1709void replace(Container &Cont, typename Container::iterator ContIt,
1710 typename Container::iterator ContEnd, Range R) {
1711 replace(Cont, ContIt, ContEnd, R.begin(), R.end());
1712}
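/// Usage sketch for the two replace overloads (illustrative):
/// \code
///   std::vector<int> V = {1, 2, 3, 4};
///   // Replace the middle two elements with three new values.
///   replace(V, V.begin() + 1, V.begin() + 3, {9, 9, 9});
///   // V == {1, 9, 9, 9, 4}
/// \endcode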
1713
1714/// An STL-style algorithm similar to std::for_each that applies a second
1715/// functor between every pair of elements.
1716///
1717/// This provides the control flow logic to, for example, print a
1718/// comma-separated list:
1719/// \code
1720/// interleave(names.begin(), names.end(),
1721/// [&](StringRef name) { os << name; },
1722/// [&] { os << ", "; });
1723/// \endcode
1724template <typename ForwardIterator, typename UnaryFunctor,
1725 typename NullaryFunctor,
1726 typename = typename std::enable_if<
1727 !std::is_constructible<StringRef, UnaryFunctor>::value &&
1728 !std::is_constructible<StringRef, NullaryFunctor>::value>::type>
1729inline void interleave(ForwardIterator begin, ForwardIterator end,
1730 UnaryFunctor each_fn, NullaryFunctor between_fn) {
1731 if (begin == end)
1732 return;
1733 each_fn(*begin);
1734 ++begin;
1735 for (; begin != end; ++begin) {
1736 between_fn();
1737 each_fn(*begin);
1738 }
1739}
1740
1741template <typename Container, typename UnaryFunctor, typename NullaryFunctor,
1742 typename = typename std::enable_if<
1743 !std::is_constructible<StringRef, UnaryFunctor>::value &&
1744 !std::is_constructible<StringRef, NullaryFunctor>::value>::type>
1745inline void interleave(const Container &c, UnaryFunctor each_fn,
1746 NullaryFunctor between_fn) {
1747 interleave(c.begin(), c.end(), each_fn, between_fn);
1748}
1749
1750/// Overload of interleave for the common case of string separator.
1751template <typename Container, typename UnaryFunctor, typename StreamT,
1752 typename T = detail::ValueOfRange<Container>>
1753inline void interleave(const Container &c, StreamT &os, UnaryFunctor each_fn,
1754 const StringRef &separator) {
1755 interleave(c.begin(), c.end(), each_fn, [&] { os << separator; });
1756}
1757template <typename Container, typename StreamT,
1758 typename T = detail::ValueOfRange<Container>>
1759inline void interleave(const Container &c, StreamT &os,
1760 const StringRef &separator) {
1761 interleave(
1762 c, os, [&](const T &a) { os << a; }, separator);
1763}
1764
1765template <typename Container, typename UnaryFunctor, typename StreamT,
1766 typename T = detail::ValueOfRange<Container>>
1767inline void interleaveComma(const Container &c, StreamT &os,
1768 UnaryFunctor each_fn) {
1769 interleave(c, os, each_fn, ", ");
1770}
1771template <typename Container, typename StreamT,
1772 typename T = detail::ValueOfRange<Container>>
1773inline void interleaveComma(const Container &c, StreamT &os) {
1774 interleaveComma(c, os, [&](const T &a) { os << a; });
1775}
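/// Usage sketch (illustrative; 'OS' stands for any stream with operator<<,
/// e.g. an llvm::raw_ostream):
/// \code
///   std::vector<StringRef> Names = {"a", "b", "c"};
///   interleaveComma(Names, OS); // prints "a, b, c"
/// \endcode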
1776
1777//===----------------------------------------------------------------------===//
1778// Extra additions to <memory>
1779//===----------------------------------------------------------------------===//
1780
1781struct FreeDeleter {
1782 void operator()(void* v) {
1783 ::free(v);
1784 }
1785};
1786
1787template<typename First, typename Second>
1788struct pair_hash {
1789 size_t operator()(const std::pair<First, Second> &P) const {
1790 return std::hash<First>()(P.first) * 31 + std::hash<Second>()(P.second);
1791 }
1792};
1793
1794/// Binary functor that adapts to any other binary functor after dereferencing
1795/// operands.
1796template <typename T> struct deref {
1797 T func;
1798
1799 // Could be further improved to cope with non-derivable functors and
1800 // non-binary functors (should be a variadic template member function
1801 // operator()).
1802 template <typename A, typename B> auto operator()(A &lhs, B &rhs) const {
1803 assert(lhs);
1804 assert(rhs);
1805 return func(*lhs, *rhs);
1806 }
1807};
1808
1809namespace detail {
1810
1811template <typename R> class enumerator_iter;
1812
1813template <typename R> struct result_pair {
1814 using value_reference =
1815 typename std::iterator_traits<IterOfRange<R>>::reference;
1816
1817 friend class enumerator_iter<R>;
1818
1819 result_pair() = default;
1820 result_pair(std::size_t Index, IterOfRange<R> Iter)
1821 : Index(Index), Iter(Iter) {}
1822
1823 result_pair<R>(const result_pair<R> &Other)
1824 : Index(Other.Index), Iter(Other.Iter) {}
1825 result_pair<R> &operator=(const result_pair<R> &Other) {
1826 Index = Other.Index;
1827 Iter = Other.Iter;
1828 return *this;
1829 }
1830
1831 std::size_t index() const { return Index; }
1832 const value_reference value() const { return *Iter; }
1833 value_reference value() { return *Iter; }
1834
1835private:
1836 std::size_t Index = std::numeric_limits<std::size_t>::max();
1837 IterOfRange<R> Iter;
1838};
1839
1840template <typename R>
1841class enumerator_iter
1842 : public iterator_facade_base<
1843 enumerator_iter<R>, std::forward_iterator_tag, result_pair<R>,
1844 typename std::iterator_traits<IterOfRange<R>>::difference_type,
1845 typename std::iterator_traits<IterOfRange<R>>::pointer,
1846 typename std::iterator_traits<IterOfRange<R>>::reference> {
1847 using result_type = result_pair<R>;
1848
1849public:
1850 explicit enumerator_iter(IterOfRange<R> EndIter)
1851 : Result(std::numeric_limits<size_t>::max(), EndIter) {}
1852
1853 enumerator_iter(std::size_t Index, IterOfRange<R> Iter)
1854 : Result(Index, Iter) {}
1855
1856 result_type &operator*() { return Result; }
1857 const result_type &operator*() const { return Result; }
1858
1859 enumerator_iter<R> &operator++() {
1860 assert(Result.Index != std::numeric_limits<size_t>::max());
1861 ++Result.Iter;
1862 ++Result.Index;
1863 return *this;
1864 }
1865
1866 bool operator==(const enumerator_iter<R> &RHS) const {
1867 // Don't compare indices here, only iterators. It's possible for an end
1868 // iterator to have different indices depending on whether it was created
1869 // by calling std::end() versus incrementing a valid iterator.
1870 return Result.Iter == RHS.Result.Iter;
1871 }
1872
1873 enumerator_iter<R>(const enumerator_iter<R> &Other) : Result(Other.Result) {}
1874 enumerator_iter<R> &operator=(const enumerator_iter<R> &Other) {
1875 Result = Other.Result;
1876 return *this;
1877 }
1878
1879private:
1880 result_type Result;
1881};
1882
1883template <typename R> class enumerator {
1884public:
1885 explicit enumerator(R &&Range) : TheRange(std::forward<R>(Range)) {}
1886
1887 enumerator_iter<R> begin() {
1888 return enumerator_iter<R>(0, std::begin(TheRange));
1889 }
1890
1891 enumerator_iter<R> end() {
1892 return enumerator_iter<R>(std::end(TheRange));
1893 }
1894
1895private:
1896 R TheRange;
1897};
1898
1899} // end namespace detail
1900
1901/// Given an input range, returns a new range whose values are pairs (A, B)
1902/// such that A is the 0-based index of the item in the sequence, and B is
1903/// the value from the original sequence. Example:
1904///
1905/// std::vector<char> Items = {'A', 'B', 'C', 'D'};
1906/// for (auto X : enumerate(Items)) {
1907/// printf("Item %d - %c\n", X.index(), X.value());
1908/// }
1909///
1910/// Output:
1911/// Item 0 - A
1912/// Item 1 - B
1913/// Item 2 - C
1914/// Item 3 - D
1915///
1916template <typename R> detail::enumerator<R> enumerate(R &&TheRange) {
1917 return detail::enumerator<R>(std::forward<R>(TheRange));
1918}
1919
1920namespace detail {
1921
1922template <typename F, typename Tuple, std::size_t... I>
1923decltype(auto) apply_tuple_impl(F &&f, Tuple &&t, std::index_sequence<I...>) {
1924 return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
1925}
1926
1927} // end namespace detail
1928
1929/// Given an input tuple (a1, a2, ..., an), pass the arguments of the
1930/// tuple variadically to f as if by calling f(a1, a2, ..., an) and
1931/// return the result.
1932template <typename F, typename Tuple>
1933decltype(auto) apply_tuple(F &&f, Tuple &&t) {
1934 using Indices = std::make_index_sequence<
1935 std::tuple_size<typename std::decay<Tuple>::type>::value>;
1936
1937 return detail::apply_tuple_impl(std::forward<F>(f), std::forward<Tuple>(t),
1938 Indices{});
1939}
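/// Usage sketch (illustrative):
/// \code
///   auto Add = [](int A, int B) { return A + B; };
///   int Sum = apply_tuple(Add, std::make_tuple(2, 3)); // Sum == 5
/// \endcode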
1940
1941/// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N)
1942/// time. Not meant for use with random-access iterators.
1943/// Can optionally take a predicate to lazily filter some items.
1944template <typename IterTy,
1945 typename Pred = bool (*)(const decltype(*std::declval<IterTy>()) &)>
1946bool hasNItems(
1947 IterTy &&Begin, IterTy &&End, unsigned N,
1948 Pred &&ShouldBeCounted =
1949 [](const decltype(*std::declval<IterTy>()) &) { return true; },
1950 std::enable_if_t<
1951 !std::is_base_of<std::random_access_iterator_tag,
1952 typename std::iterator_traits<std::remove_reference_t<
1953 decltype(Begin)>>::iterator_category>::value,
1954 void> * = nullptr) {
1955 for (; N; ++Begin) {
1956 if (Begin == End)
1957 return false; // Too few.
1958 N -= ShouldBeCounted(*Begin);
1959 }
1960 for (; Begin != End; ++Begin)
1961 if (ShouldBeCounted(*Begin))
1962 return false; // Too many.
1963 return true;
1964}
1965
1966/// Return true if the sequence [Begin, End) has N or more items. Runs in O(N)
1967/// time. Not meant for use with random-access iterators.
1968/// Can optionally take a predicate to lazily filter some items.
1969template <typename IterTy,
1970 typename Pred = bool (*)(const decltype(*std::declval<IterTy>()) &)>
1971bool hasNItemsOrMore(
1972 IterTy &&Begin, IterTy &&End, unsigned N,
1973 Pred &&ShouldBeCounted =
1974 [](const decltype(*std::declval<IterTy>()) &) { return true; },
1975 std::enable_if_t<
1976 !std::is_base_of<std::random_access_iterator_tag,
1977 typename std::iterator_traits<std::remove_reference_t<
1978 decltype(Begin)>>::iterator_category>::value,
1979 void> * = nullptr) {
1980 for (; N; ++Begin) {
1981 if (Begin == End)
1982 return false; // Too few.
1983 N -= ShouldBeCounted(*Begin);
1984 }
1985 return true;
1986}
1987
1988/// Returns true if the sequence [Begin, End) has N or fewer items. Can
1989/// optionally take a predicate to lazily filter some items.
1990template <typename IterTy,
1991 typename Pred = bool (*)(const decltype(*std::declval<IterTy>()) &)>
1992bool hasNItemsOrLess(
1993 IterTy &&Begin, IterTy &&End, unsigned N,
1994 Pred &&ShouldBeCounted = [](const decltype(*std::declval<IterTy>()) &) {
1995 return true;
1996 }) {
1997 assert(N != std::numeric_limits<unsigned>::max());
1998 return !hasNItemsOrMore(Begin, End, N + 1, ShouldBeCounted);
1999}
2000
2001/// Returns true if the given container has exactly N items
2002template <typename ContainerTy> bool hasNItems(ContainerTy &&C, unsigned N) {
2003 return hasNItems(std::begin(C), std::end(C), N);
2004}
2005
2006/// Returns true if the given container has N or more items
2007template <typename ContainerTy>
2008bool hasNItemsOrMore(ContainerTy &&C, unsigned N) {
2009 return hasNItemsOrMore(std::begin(C), std::end(C), N);
2010}
2011
2012/// Returns true if the given container has N or fewer items
2013template <typename ContainerTy>
2014bool hasNItemsOrLess(ContainerTy &&C, unsigned N) {
2015 return hasNItemsOrLess(std::begin(C), std::end(C), N);
2016}
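/// Usage sketch (illustrative; 'Preds' is a hypothetical list): unlike
/// computing the full size, these helpers stop walking the range once the
/// answer is known, which matters for non-random-access containers.
/// \code
///   std::list<int> Preds = {1, 2, 3};
///   bool ExactlyTwo = hasNItems(Preds, 2);       // false
///   bool AtLeastTwo = hasNItemsOrMore(Preds, 2); // true
/// \endcode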
2017
2018/// Returns a raw pointer that represents the same address as the argument.
2019///
2020/// This implementation can be removed once we move to C++20 where it's defined
2021/// as std::to_address().
2022///
2023/// The std::pointer_traits<>::to_address(p) variations of these overloads have
2024/// not been implemented.
2025template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
2026template <class T> constexpr T *to_address(T *P) { return P; }
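/// Usage sketch (illustrative): both overloads yield the raw address, whether
/// the argument is a smart pointer or already a raw pointer.
/// \code
///   auto Owned = std::make_unique<int>(7);
///   int *Raw = to_address(Owned); // same as Owned.get()
///   int *Same = to_address(Raw);  // raw pointers pass through unchanged
/// \endcode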
2027
2028} // end namespace llvm
2029
2030#endif // LLVM_ADT_STLEXTRAS_H

/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/bits/stl_algo.h

1// Algorithm implementation -*- C++ -*-
2
3// Copyright (C) 2001-2016 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/*
26 *
27 * Copyright (c) 1994
28 * Hewlett-Packard Company
29 *
30 * Permission to use, copy, modify, distribute and sell this software
31 * and its documentation for any purpose is hereby granted without fee,
32 * provided that the above copyright notice appear in all copies and
33 * that both that copyright notice and this permission notice appear
34 * in supporting documentation. Hewlett-Packard Company makes no
35 * representations about the suitability of this software for any
36 * purpose. It is provided "as is" without express or implied warranty.
37 *
38 *
39 * Copyright (c) 1996
40 * Silicon Graphics Computer Systems, Inc.
41 *
42 * Permission to use, copy, modify, distribute and sell this software
43 * and its documentation for any purpose is hereby granted without fee,
44 * provided that the above copyright notice appear in all copies and
45 * that both that copyright notice and this permission notice appear
46 * in supporting documentation. Silicon Graphics makes no
47 * representations about the suitability of this software for any
48 * purpose. It is provided "as is" without express or implied warranty.
49 */
50
51/** @file bits/stl_algo.h
52 * This is an internal header file, included by other library headers.
53 * Do not attempt to use it directly. @headername{algorithm}
54 */
55
56#ifndef _STL_ALGO_H
57#define _STL_ALGO_H 1
58
59#include <cstdlib> // for rand
60#include <bits/algorithmfwd.h>
61#include <bits/stl_heap.h>
62#include <bits/stl_tempbuf.h> // for _Temporary_buffer
63#include <bits/predefined_ops.h>
64
65#if __cplusplus >= 201103L
66#include <bits/uniform_int_dist.h>
67#endif
68
69// See concept_check.h for the __glibcxx_*_requires macros.
70
71namespace std _GLIBCXX_VISIBILITY(default)
72{
73_GLIBCXX_BEGIN_NAMESPACE_VERSION
74
75 /// Swaps the median value of *__a, *__b and *__c under __comp to *__result
76 template<typename _Iterator, typename _Compare>
77 void
78 __move_median_to_first(_Iterator __result,_Iterator __a, _Iterator __b,
79 _Iterator __c, _Compare __comp)
80 {
81 if (__comp(__a, __b))
82 {
83 if (__comp(__b, __c))
84 std::iter_swap(__result, __b);
85 else if (__comp(__a, __c))
86 std::iter_swap(__result, __c);
87 else
88 std::iter_swap(__result, __a);
89 }
90 else if (__comp(__a, __c))
91 std::iter_swap(__result, __a);
92 else if (__comp(__b, __c))
93 std::iter_swap(__result, __c);
94 else
95 std::iter_swap(__result, __b);
96 }
97
98 /// This is an overload used by find algos for the Input Iterator case.
99 template<typename _InputIterator, typename _Predicate>
100 inline _InputIterator
101 __find_if(_InputIterator __first, _InputIterator __last,
102 _Predicate __pred, input_iterator_tag)
103 {
104 while (__first != __last && !__pred(__first))
105 ++__first;
106 return __first;
107 }
108
109 /// This is an overload used by find algos for the RAI case.
110 template<typename _RandomAccessIterator, typename _Predicate>
111 _RandomAccessIterator
112 __find_if(_RandomAccessIterator __first, _RandomAccessIterator __last,
113 _Predicate __pred, random_access_iterator_tag)
114 {
115 typename iterator_traits<_RandomAccessIterator>::difference_type
116 __trip_count = (__last - __first) >> 2;
117
118 for (; __trip_count > 0; --__trip_count)
119 {
120 if (__pred(__first))
121 return __first;
122 ++__first;
123
124 if (__pred(__first))
125 return __first;
126 ++__first;
127
128 if (__pred(__first))
129 return __first;
130 ++__first;
131
132 if (__pred(__first))
133 return __first;
134 ++__first;
135 }
136
137 switch (__last - __first)
138 {
139 case 3:
140 if (__pred(__first))
141 return __first;
142 ++__first;
143 case 2:
144 if (__pred(__first))
145 return __first;
146 ++__first;
147 case 1:
148 if (__pred(__first))
149 return __first;
150 ++__first;
151 case 0:
152 default:
153 return __last;
154 }
155 }
156
157 template<typename _Iterator, typename _Predicate>
158 inline _Iterator
159 __find_if(_Iterator __first, _Iterator __last, _Predicate __pred)
160 {
161 return __find_if(__first, __last, __pred,
162 std::__iterator_category(__first));
163 }
164
165 /// Provided for stable_partition to use.
166 template<typename _InputIterator, typename _Predicate>
167 inline _InputIterator
168 __find_if_not(_InputIterator __first, _InputIterator __last,
169 _Predicate __pred)
170 {
171 return std::__find_if(__first, __last,
172 __gnu_cxx::__ops::__negate(__pred),
173 std::__iterator_category(__first));
174 }
175
176 /// Like find_if_not(), but uses and updates a count of the
177 /// remaining range length instead of comparing against an end
178 /// iterator.
179 template<typename _InputIterator, typename _Predicate, typename _Distance>
180 _InputIterator
181 __find_if_not_n(_InputIterator __first, _Distance& __len, _Predicate __pred)
182 {
183 for (; __len; --__len, ++__first)
184 if (!__pred(__first))
185 break;
186 return __first;
187 }
188
189 // set_difference
190 // set_intersection
191 // set_symmetric_difference
192 // set_union
193 // for_each
194 // find
195 // find_if
196 // find_first_of
197 // adjacent_find
198 // count
199 // count_if
200 // search
201
202 template<typename _ForwardIterator1, typename _ForwardIterator2,
203 typename _BinaryPredicate>
204 _ForwardIterator1
205 __search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
206 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
207 _BinaryPredicate __predicate)
208 {
209 // Test for empty ranges
210 if (__first1 == __last1 || __first2 == __last2)
211 return __first1;
212
213 // Test for a pattern of length 1.
214 _ForwardIterator2 __p1(__first2);
215 if (++__p1 == __last2)
216 return std::__find_if(__first1, __last1,
217 __gnu_cxx::__ops::__iter_comp_iter(__predicate, __first2));
218
219 // General case.
220 _ForwardIterator2 __p;
221 _ForwardIterator1 __current = __first1;
222
223 for (;;)
224 {
225 __first1 =
226 std::__find_if(__first1, __last1,
227 __gnu_cxx::__ops::__iter_comp_iter(__predicate, __first2));
228
229 if (__first1 == __last1)
230 return __last1;
231
232 __p = __p1;
233 __current = __first1;
234 if (++__current == __last1)
235 return __last1;
236
237 while (__predicate(__current, __p))
238 {
239 if (++__p == __last2)
240 return __first1;
241 if (++__current == __last1)
242 return __last1;
243 }
244 ++__first1;
245 }
246 return __first1;
247 }
248
249 // search_n
250
251 /**
252 * This is a helper function for search_n overloaded for forward iterators.
253 */
254 template<typename _ForwardIterator, typename _Integer,
255 typename _UnaryPredicate>
256 _ForwardIterator
257 __search_n_aux(_ForwardIterator __first, _ForwardIterator __last,
258 _Integer __count, _UnaryPredicate __unary_pred,
259 std::forward_iterator_tag)
260 {
261 __first = std::__find_if(__first, __last, __unary_pred);
262 while (__first != __last)
263 {
264 typename iterator_traits<_ForwardIterator>::difference_type
265 __n = __count;
266 _ForwardIterator __i = __first;
267 ++__i;
268 while (__i != __last && __n != 1 && __unary_pred(__i))
269 {
270 ++__i;
271 --__n;
272 }
273 if (__n == 1)
274 return __first;
275 if (__i == __last)
276 return __last;
277 __first = std::__find_if(++__i, __last, __unary_pred);
278 }
279 return __last;
280 }
281
282 /**
283 * This is a helper function for search_n overloaded for random access
284 * iterators.
285 */
286 template<typename _RandomAccessIter, typename _Integer,
287 typename _UnaryPredicate>
288 _RandomAccessIter
289 __search_n_aux(_RandomAccessIter __first, _RandomAccessIter __last,
290 _Integer __count, _UnaryPredicate __unary_pred,
291 std::random_access_iterator_tag)
292 {
293 typedef typename std::iterator_traits<_RandomAccessIter>::difference_type
294 _DistanceType;
295
296 _DistanceType __tailSize = __last - __first;
297 _DistanceType __remainder = __count;
298
299 while (__remainder <= __tailSize) // the main loop...
300 {
301 __first += __remainder;
302 __tailSize -= __remainder;
303 // __first here is always pointing to one past the last element of
304 // next possible match.
305 _RandomAccessIter __backTrack = __first;
306 while (__unary_pred(--__backTrack))
307 {
308 if (--__remainder == 0)
309 return (__first - __count); // Success
310 }
311 __remainder = __count + 1 - (__first - __backTrack);
312 }
313 return __last; // Failure
314 }
315
316 template<typename _ForwardIterator, typename _Integer,
317 typename _UnaryPredicate>
318 _ForwardIterator
319 __search_n(_ForwardIterator __first, _ForwardIterator __last,
320 _Integer __count,
321 _UnaryPredicate __unary_pred)
322 {
323 if (__count <= 0)
324 return __first;
325
326 if (__count == 1)
327 return std::__find_if(__first, __last, __unary_pred);
328
329 return std::__search_n_aux(__first, __last, __count, __unary_pred,
330 std::__iterator_category(__first));
331 }
332
333 // find_end for forward iterators.
334 template<typename _ForwardIterator1, typename _ForwardIterator2,
335 typename _BinaryPredicate>
336 _ForwardIterator1
337 __find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
338 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
339 forward_iterator_tag, forward_iterator_tag,
340 _BinaryPredicate __comp)
341 {
342 if (__first2 == __last2)
343 return __last1;
344
345 _ForwardIterator1 __result = __last1;
346 while (1)
347 {
348 _ForwardIterator1 __new_result
349 = std::__search(__first1, __last1, __first2, __last2, __comp);
350 if (__new_result == __last1)
351 return __result;
352 else
353 {
354 __result = __new_result;
355 __first1 = __new_result;
356 ++__first1;
357 }
358 }
359 }
360
361 // find_end for bidirectional iterators (much faster).
362 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
363 typename _BinaryPredicate>
364 _BidirectionalIterator1
365 __find_end(_BidirectionalIterator1 __first1,
366 _BidirectionalIterator1 __last1,
367 _BidirectionalIterator2 __first2,
368 _BidirectionalIterator2 __last2,
369 bidirectional_iterator_tag, bidirectional_iterator_tag,
370 _BinaryPredicate __comp)
371 {
372 // concept requirements
373 __glibcxx_function_requires(_BidirectionalIteratorConcept<
374 _BidirectionalIterator1>)
375 __glibcxx_function_requires(_BidirectionalIteratorConcept<
376 _BidirectionalIterator2>)
377
378 typedef reverse_iterator<_BidirectionalIterator1> _RevIterator1;
379 typedef reverse_iterator<_BidirectionalIterator2> _RevIterator2;
380
381 _RevIterator1 __rlast1(__first1);
382 _RevIterator2 __rlast2(__first2);
383 _RevIterator1 __rresult = std::__search(_RevIterator1(__last1), __rlast1,
384 _RevIterator2(__last2), __rlast2,
385 __comp);
386
387 if (__rresult == __rlast1)
388 return __last1;
389 else
390 {
391 _BidirectionalIterator1 __result = __rresult.base();
392 std::advance(__result, -std::distance(__first2, __last2));
393 return __result;
394 }
395 }
396
397 /**
398 * @brief Find last matching subsequence in a sequence.
399 * @ingroup non_mutating_algorithms
400 * @param __first1 Start of range to search.
401 * @param __last1 End of range to search.
402 * @param __first2 Start of sequence to match.
403 * @param __last2 End of sequence to match.
404 * @return The last iterator @c i in the range
405 * @p [__first1,__last1-(__last2-__first2)) such that @c *(i+N) ==
406 * @p *(__first2+N) for each @c N in the range @p
407 * [0,__last2-__first2), or @p __last1 if no such iterator exists.
408 *
409 * Searches the range @p [__first1,__last1) for a sub-sequence that
410 * compares equal value-by-value with the sequence given by @p
411     *  [__first2,__last2) and returns an iterator to the first
412 * element of the sub-sequence, or @p __last1 if the sub-sequence
413 * is not found. The sub-sequence will be the last such
414 * subsequence contained in [__first1,__last1).
415 *
416 * Because the sub-sequence must lie completely within the range @p
417 * [__first1,__last1) it must start at a position less than @p
418 * __last1-(__last2-__first2) where @p __last2-__first2 is the
419 * length of the sub-sequence. This means that the returned
420 * iterator @c i will be in the range @p
421 * [__first1,__last1-(__last2-__first2))
422 */
423 template<typename _ForwardIterator1, typename _ForwardIterator2>
424 inline _ForwardIterator1
425 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
426 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
427 {
428 // concept requirements
429 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
430 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
431 __glibcxx_function_requires(_EqualOpConcept<
432 typename iterator_traits<_ForwardIterator1>::value_type,
433 typename iterator_traits<_ForwardIterator2>::value_type>)
434 __glibcxx_requires_valid_range(__first1, __last1);
435 __glibcxx_requires_valid_range(__first2, __last2);
436
437 return std::__find_end(__first1, __last1, __first2, __last2,
438 std::__iterator_category(__first1),
439 std::__iterator_category(__first2),
440 __gnu_cxx::__ops::__iter_equal_to_iter());
441 }
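
For orientation while reading the quoted library code, here is a minimal, self-contained usage sketch of std::find_end as documented above. It is illustrative only and not part of the analyzed source.

    // find_end_demo.cpp -- locate the last occurrence of a pattern in a sequence.
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> haystack{1, 2, 3, 1, 2, 3, 4};
      std::vector<int> needle{1, 2, 3};

      // Returns an iterator to the start of the *last* match, or haystack.end().
      auto it = std::find_end(haystack.begin(), haystack.end(),
                              needle.begin(), needle.end());
      if (it != haystack.end())
        std::cout << "last match starts at index "
                  << std::distance(haystack.begin(), it) << '\n';   // prints 3
    }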
442
443 /**
444 * @brief Find last matching subsequence in a sequence using a predicate.
445 * @ingroup non_mutating_algorithms
446 * @param __first1 Start of range to search.
447 * @param __last1 End of range to search.
448 * @param __first2 Start of sequence to match.
449 * @param __last2 End of sequence to match.
450 * @param __comp The predicate to use.
451 * @return The last iterator @c i in the range @p
452 * [__first1,__last1-(__last2-__first2)) such that @c
453 * predicate(*(i+N), @p (__first2+N)) is true for each @c N in the
454 * range @p [0,__last2-__first2), or @p __last1 if no such iterator
455 * exists.
456 *
457 * Searches the range @p [__first1,__last1) for a sub-sequence that
458 * compares equal value-by-value with the sequence given by @p
459 * [__first2,__last2) using comp as a predicate and returns an
460 * iterator to the first element of the sub-sequence, or @p __last1
461 * if the sub-sequence is not found. The sub-sequence will be the
462     *  last such subsequence contained in [__first1,__last1).
463 *
464 * Because the sub-sequence must lie completely within the range @p
465 * [__first1,__last1) it must start at a position less than @p
466 * __last1-(__last2-__first2) where @p __last2-__first2 is the
467 * length of the sub-sequence. This means that the returned
468 * iterator @c i will be in the range @p
469 * [__first1,__last1-(__last2-__first2))
470 */
471 template<typename _ForwardIterator1, typename _ForwardIterator2,
472 typename _BinaryPredicate>
473 inline _ForwardIterator1
474 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
475 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
476 _BinaryPredicate __comp)
477 {
478 // concept requirements
479 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
480 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
481 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
482 typename iterator_traits<_ForwardIterator1>::value_type,
483 typename iterator_traits<_ForwardIterator2>::value_type>)
484 __glibcxx_requires_valid_range(__first1, __last1);
485 __glibcxx_requires_valid_range(__first2, __last2);
486
487 return std::__find_end(__first1, __last1, __first2, __last2,
488 std::__iterator_category(__first1),
489 std::__iterator_category(__first2),
490 __gnu_cxx::__ops::__iter_comp_iter(__comp));
491 }
492
493#if __cplusplus >= 201103L
494 /**
495 * @brief Checks that a predicate is true for all the elements
496 * of a sequence.
497 * @ingroup non_mutating_algorithms
498 * @param __first An input iterator.
499 * @param __last An input iterator.
500 * @param __pred A predicate.
501 * @return True if the check is true, false otherwise.
502 *
503 * Returns true if @p __pred is true for each element in the range
504 * @p [__first,__last), and false otherwise.
505 */
506 template<typename _InputIterator, typename _Predicate>
507 inline bool
508 all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
509 { return __last == std::find_if_not(__first, __last, __pred); }
510
511 /**
512 * @brief Checks that a predicate is false for all the elements
513 * of a sequence.
514 * @ingroup non_mutating_algorithms
515 * @param __first An input iterator.
516 * @param __last An input iterator.
517 * @param __pred A predicate.
518 * @return True if the check is true, false otherwise.
519 *
520 * Returns true if @p __pred is false for each element in the range
521 * @p [__first,__last), and false otherwise.
522 */
523 template<typename _InputIterator, typename _Predicate>
524 inline bool
525 none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
526    { return __last == _GLIBCXX_STD_A::find_if(__first, __last, __pred); }
46. Returning the value 1, which participates in a condition later
527
528 /**
529   * @brief  Checks that a predicate is true for at least one element
530 * of a sequence.
531 * @ingroup non_mutating_algorithms
532 * @param __first An input iterator.
533 * @param __last An input iterator.
534 * @param __pred A predicate.
535 * @return True if the check is true, false otherwise.
536 *
537 * Returns true if an element exists in the range @p
538 * [__first,__last) such that @p __pred is true, and false
539 * otherwise.
540 */
541 template<typename _InputIterator, typename _Predicate>
542 inline bool
543 any_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
544 { return !std::none_of(__first, __last, __pred); }
45. Calling 'none_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
47. Returning from 'none_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/lib/Target/X86/X86ISelLowering.cpp:36152:30)>'
48. Returning zero, which participates in a condition later
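
The notes above show the bug path passing through any_of, which by the definition directly above is simply the negation of none_of, and none_of in turn is a find_if that must reach __last. A stand-alone illustration of the three quantifier algorithms (a minimal sketch, not part of the analyzed source):

    // quantifier_demo.cpp -- all_of / none_of / any_of as defined in this header.
    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{2, 4, 6};
      auto even = [](int x) { return x % 2 == 0; };

      bool all  = std::all_of(v.begin(), v.end(), even);   // true
      bool none = std::none_of(v.begin(), v.end(), even);  // false
      bool any  = std::any_of(v.begin(), v.end(), even);   // true, i.e. !none

      return (all && !none && any) ? 0 : 1;                // exits 0
    }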
545
546 /**
547 * @brief Find the first element in a sequence for which a
548 * predicate is false.
549 * @ingroup non_mutating_algorithms
550 * @param __first An input iterator.
551 * @param __last An input iterator.
552 * @param __pred A predicate.
553 * @return The first iterator @c i in the range @p [__first,__last)
554 * such that @p __pred(*i) is false, or @p __last if no such iterator exists.
555 */
556 template<typename _InputIterator, typename _Predicate>
557 inline _InputIterator
558 find_if_not(_InputIterator __first, _InputIterator __last,
559 _Predicate __pred)
560 {
561 // concept requirements
562 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
563 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
564 typename iterator_traits<_InputIterator>::value_type>)
565 __glibcxx_requires_valid_range(__first, __last);
566 return std::__find_if_not(__first, __last,
567 __gnu_cxx::__ops::__pred_iter(__pred));
568 }
569
570 /**
571 * @brief Checks whether the sequence is partitioned.
572 * @ingroup mutating_algorithms
573 * @param __first An input iterator.
574 * @param __last An input iterator.
575 * @param __pred A predicate.
576     *  @return  True if the range @p [__first,__last) is partitioned by @p __pred,
577 * i.e. if all elements that satisfy @p __pred appear before those that
578 * do not.
579 */
580 template<typename _InputIterator, typename _Predicate>
581 inline bool
582 is_partitioned(_InputIterator __first, _InputIterator __last,
583 _Predicate __pred)
584 {
585 __first = std::find_if_not(__first, __last, __pred);
586 return std::none_of(__first, __last, __pred);
587 }
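
is_partitioned combines the two primitives above: it skips the prefix that satisfies the predicate with find_if_not, then requires that no later element satisfies it via none_of. A minimal stand-alone sketch (not part of the analyzed source):

    // is_partitioned_demo.cpp
    #include <algorithm>
    #include <vector>

    int main()
    {
      auto is_negative = [](int x) { return x < 0; };

      std::vector<int> a{-3, -1, 4, 7};   // all negatives first: partitioned
      std::vector<int> b{-3, 4, -1, 7};   // a negative after a positive: not partitioned

      bool pa = std::is_partitioned(a.begin(), a.end(), is_negative);  // true
      bool pb = std::is_partitioned(b.begin(), b.end(), is_negative);  // false
      return (pa && !pb) ? 0 : 1;
    }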
588
589 /**
590 * @brief Find the partition point of a partitioned range.
591 * @ingroup mutating_algorithms
592 * @param __first An iterator.
593 * @param __last Another iterator.
594 * @param __pred A predicate.
595 * @return An iterator @p mid such that @p all_of(__first, mid, __pred)
596 * and @p none_of(mid, __last, __pred) are both true.
597 */
598 template<typename _ForwardIterator, typename _Predicate>
599 _ForwardIterator
600 partition_point(_ForwardIterator __first, _ForwardIterator __last,
601 _Predicate __pred)
602 {
603 // concept requirements
604 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
605 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
606 typename iterator_traits<_ForwardIterator>::value_type>)
607
608 // A specific debug-mode test will be necessary...
609 __glibcxx_requires_valid_range(__first, __last);
610
611 typedef typename iterator_traits<_ForwardIterator>::difference_type
612 _DistanceType;
613
614 _DistanceType __len = std::distance(__first, __last);
615 _DistanceType __half;
616 _ForwardIterator __middle;
617
618 while (__len > 0)
619 {
620 __half = __len >> 1;
621 __middle = __first;
622 std::advance(__middle, __half);
623 if (__pred(*__middle))
624 {
625 __first = __middle;
626 ++__first;
627 __len = __len - __half - 1;
628 }
629 else
630 __len = __half;
631 }
632 return __first;
633 }
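
partition_point halves the remaining length on every iteration, so it performs O(log n) predicate evaluations rather than a linear scan. A minimal stand-alone sketch of its use (not part of the analyzed source):

    // partition_point_demo.cpp
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 3, 5, 2, 4, 6};           // odd values first, then even
      auto odd = [](int x) { return x % 2 != 0; };

      // Returns the boundary: the predicate holds before it and fails from it on.
      auto mid = std::partition_point(v.begin(), v.end(), odd);
      std::cout << "first even element: " << *mid << '\n';   // prints 2
    }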
634#endif
635
636 template<typename _InputIterator, typename _OutputIterator,
637 typename _Predicate>
638 _OutputIterator
639 __remove_copy_if(_InputIterator __first, _InputIterator __last,
640 _OutputIterator __result, _Predicate __pred)
641 {
642 for (; __first != __last; ++__first)
643 if (!__pred(__first))
644 {
645 *__result = *__first;
646 ++__result;
647 }
648 return __result;
649 }
650
651 /**
652 * @brief Copy a sequence, removing elements of a given value.
653 * @ingroup mutating_algorithms
654 * @param __first An input iterator.
655 * @param __last An input iterator.
656 * @param __result An output iterator.
657 * @param __value The value to be removed.
658 * @return An iterator designating the end of the resulting sequence.
659 *
660 * Copies each element in the range @p [__first,__last) not equal
661 * to @p __value to the range beginning at @p __result.
662 * remove_copy() is stable, so the relative order of elements that
663 * are copied is unchanged.
664 */
665 template<typename _InputIterator, typename _OutputIterator, typename _Tp>
666 inline _OutputIterator
667 remove_copy(_InputIterator __first, _InputIterator __last,
668 _OutputIterator __result, const _Tp& __value)
669 {
670 // concept requirements
671 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
672 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
673 typename iterator_traits<_InputIterator>::value_type>)
674 __glibcxx_function_requires(_EqualOpConcept<
675 typename iterator_traits<_InputIterator>::value_type, _Tp>)
676 __glibcxx_requires_valid_range(__first, __last);
677
678 return std::__remove_copy_if(__first, __last, __result,
679 __gnu_cxx::__ops::__iter_equals_val(__value));
680 }
681
682 /**
683 * @brief Copy a sequence, removing elements for which a predicate is true.
684 * @ingroup mutating_algorithms
685 * @param __first An input iterator.
686 * @param __last An input iterator.
687 * @param __result An output iterator.
688 * @param __pred A predicate.
689 * @return An iterator designating the end of the resulting sequence.
690 *
691 * Copies each element in the range @p [__first,__last) for which
692 * @p __pred returns false to the range beginning at @p __result.
693 *
694 * remove_copy_if() is stable, so the relative order of elements that are
695 * copied is unchanged.
696 */
697 template<typename _InputIterator, typename _OutputIterator,
698 typename _Predicate>
699 inline _OutputIterator
700 remove_copy_if(_InputIterator __first, _InputIterator __last,
701 _OutputIterator __result, _Predicate __pred)
702 {
703 // concept requirements
704 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
705 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
706 typename iterator_traits<_InputIterator>::value_type>)
707 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
708 typename iterator_traits<_InputIterator>::value_type>)
709 __glibcxx_requires_valid_range(__first, __last);
710
711 return std::__remove_copy_if(__first, __last, __result,
712 __gnu_cxx::__ops::__pred_iter(__pred));
713 }
714
715#if __cplusplus >= 201103L
716 /**
717 * @brief Copy the elements of a sequence for which a predicate is true.
718 * @ingroup mutating_algorithms
719 * @param __first An input iterator.
720 * @param __last An input iterator.
721 * @param __result An output iterator.
722 * @param __pred A predicate.
723 * @return An iterator designating the end of the resulting sequence.
724 *
725 * Copies each element in the range @p [__first,__last) for which
726 * @p __pred returns true to the range beginning at @p __result.
727 *
728 * copy_if() is stable, so the relative order of elements that are
729 * copied is unchanged.
730 */
731 template<typename _InputIterator, typename _OutputIterator,
732 typename _Predicate>
733 _OutputIterator
734 copy_if(_InputIterator __first, _InputIterator __last,
735 _OutputIterator __result, _Predicate __pred)
736 {
737 // concept requirements
738 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
739 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
740 typename iterator_traits<_InputIterator>::value_type>)
741 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
742 typename iterator_traits<_InputIterator>::value_type>)
743 __glibcxx_requires_valid_range(__first, __last);
744
745 for (; __first != __last; ++__first)
746 if (__pred(*__first))
747 {
748 *__result = *__first;
749 ++__result;
750 }
751 return __result;
752 }
753
754 template<typename _InputIterator, typename _Size, typename _OutputIterator>
755 _OutputIterator
756 __copy_n(_InputIterator __first, _Size __n,
757 _OutputIterator __result, input_iterator_tag)
758 {
759 if (__n > 0)
760 {
761 while (true)
762 {
763 *__result = *__first;
764 ++__result;
765 if (--__n > 0)
766 ++__first;
767 else
768 break;
769 }
770 }
771 return __result;
772 }
773
774 template<typename _RandomAccessIterator, typename _Size,
775 typename _OutputIterator>
776 inline _OutputIterator
777 __copy_n(_RandomAccessIterator __first, _Size __n,
778 _OutputIterator __result, random_access_iterator_tag)
779 { return std::copy(__first, __first + __n, __result); }
780
781 /**
782 * @brief Copies the range [first,first+n) into [result,result+n).
783 * @ingroup mutating_algorithms
784 * @param __first An input iterator.
785 * @param __n The number of elements to copy.
786 * @param __result An output iterator.
787 * @return result+n.
788 *
789 * This inline function will boil down to a call to @c memmove whenever
790 * possible. Failing that, if random access iterators are passed, then the
791 * loop count will be known (and therefore a candidate for compiler
792 * optimizations such as unrolling).
793 */
794 template<typename _InputIterator, typename _Size, typename _OutputIterator>
795 inline _OutputIterator
796 copy_n(_InputIterator __first, _Size __n, _OutputIterator __result)
797 {
798 // concept requirements
799 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
800 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
801 typename iterator_traits<_InputIterator>::value_type>)
802
803 return std::__copy_n(__first, __n, __result,
804 std::__iterator_category(__first));
805 }
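
As the comment notes, the random-access overload of copy_n forwards to copy(first, first + n, result), so it can degenerate to a memmove for trivial types. A minimal usage sketch (not part of the analyzed source):

    // copy_n_demo.cpp
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <string>

    int main()
    {
      std::string src = "abcdefgh";
      std::string dst;

      // Copies exactly three characters into the back-inserted destination.
      std::copy_n(src.begin(), 3, std::back_inserter(dst));
      std::cout << dst << '\n';   // prints "abc"
    }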
806
807 /**
808 * @brief Copy the elements of a sequence to separate output sequences
809 * depending on the truth value of a predicate.
810 * @ingroup mutating_algorithms
811 * @param __first An input iterator.
812 * @param __last An input iterator.
813 * @param __out_true An output iterator.
814 * @param __out_false An output iterator.
815 * @param __pred A predicate.
816 * @return A pair designating the ends of the resulting sequences.
817 *
818 * Copies each element in the range @p [__first,__last) for which
819 * @p __pred returns true to the range beginning at @p out_true
820 * and each element for which @p __pred returns false to @p __out_false.
821 */
822 template<typename _InputIterator, typename _OutputIterator1,
823 typename _OutputIterator2, typename _Predicate>
824 pair<_OutputIterator1, _OutputIterator2>
825 partition_copy(_InputIterator __first, _InputIterator __last,
826 _OutputIterator1 __out_true, _OutputIterator2 __out_false,
827 _Predicate __pred)
828 {
829 // concept requirements
830 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
831 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator1,
832 typename iterator_traits<_InputIterator>::value_type>)
833 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator2,
834 typename iterator_traits<_InputIterator>::value_type>)
835 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
836 typename iterator_traits<_InputIterator>::value_type>)
837 __glibcxx_requires_valid_range(__first, __last);
838
839 for (; __first != __last; ++__first)
840 if (__pred(*__first))
841 {
842 *__out_true = *__first;
843 ++__out_true;
844 }
845 else
846 {
847 *__out_false = *__first;
848 ++__out_false;
849 }
850
851 return pair<_OutputIterator1, _OutputIterator2>(__out_true, __out_false);
852 }
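
partition_copy routes each element to one of two output sequences in a single pass, as the loop above shows. A minimal stand-alone sketch (not part of the analyzed source):

    // partition_copy_demo.cpp
    #include <algorithm>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5, 6};
      std::vector<int> evens, odds;

      // Elements satisfying the predicate go to the first output, the rest to the second.
      auto ends = std::partition_copy(v.begin(), v.end(),
                                      std::back_inserter(evens),
                                      std::back_inserter(odds),
                                      [](int x) { return x % 2 == 0; });
      (void)ends;   // pair holding the final positions of both output iterators
      // evens == {2, 4, 6}, odds == {1, 3, 5}
      return (evens.size() == 3 && odds.size() == 3) ? 0 : 1;
    }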
853#endif
854
855 template<typename _ForwardIterator, typename _Predicate>
856 _ForwardIterator
857 __remove_if(_ForwardIterator __first, _ForwardIterator __last,
858 _Predicate __pred)
859 {
860 __first = std::__find_if(__first, __last, __pred);
861 if (__first == __last)
862 return __first;
863 _ForwardIterator __result = __first;
864 ++__first;
865 for (; __first != __last; ++__first)
866 if (!__pred(__first))
867 {
868	    *__result = _GLIBCXX_MOVE(*__first);
869 ++__result;
870 }
871 return __result;
872 }
873
874 /**
875 * @brief Remove elements from a sequence.
876 * @ingroup mutating_algorithms
877 * @param __first An input iterator.
878 * @param __last An input iterator.
879 * @param __value The value to be removed.
880 * @return An iterator designating the end of the resulting sequence.
881 *
882 * All elements equal to @p __value are removed from the range
883 * @p [__first,__last).
884 *
885 * remove() is stable, so the relative order of elements that are
886 * not removed is unchanged.
887 *
888 * Elements between the end of the resulting sequence and @p __last
889 * are still present, but their value is unspecified.
890 */
891 template<typename _ForwardIterator, typename _Tp>
892 inline _ForwardIterator
893 remove(_ForwardIterator __first, _ForwardIterator __last,
894 const _Tp& __value)
895 {
896 // concept requirements
897 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
898 _ForwardIterator>)
899 __glibcxx_function_requires(_EqualOpConcept<
900 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
901 __glibcxx_requires_valid_range(__first, __last);
902
903 return std::__remove_if(__first, __last,
904 __gnu_cxx::__ops::__iter_equals_val(__value));
905 }
906
907 /**
908 * @brief Remove elements from a sequence using a predicate.
909 * @ingroup mutating_algorithms
910 * @param __first A forward iterator.
911 * @param __last A forward iterator.
912 * @param __pred A predicate.
913 * @return An iterator designating the end of the resulting sequence.
914 *
915 * All elements for which @p __pred returns true are removed from the range
916 * @p [__first,__last).
917 *
918 * remove_if() is stable, so the relative order of elements that are
919 * not removed is unchanged.
920 *
921 * Elements between the end of the resulting sequence and @p __last
922 * are still present, but their value is unspecified.
923 */
924 template<typename _ForwardIterator, typename _Predicate>
925 inline _ForwardIterator
926 remove_if(_ForwardIterator __first, _ForwardIterator __last,
927 _Predicate __pred)
928 {
929 // concept requirements
930 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
931 _ForwardIterator>)
932 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
933 typename iterator_traits<_ForwardIterator>::value_type>)
934 __glibcxx_requires_valid_range(__first, __last);
935
936 return std::__remove_if(__first, __last,
937 __gnu_cxx::__ops::__pred_iter(__pred));
938 }
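
Because remove_if only compacts the kept elements and leaves the tail in an unspecified state, callers normally pair it with the container's erase, the classic erase/remove idiom. A minimal sketch (not part of the analyzed source):

    // remove_if_demo.cpp -- the erase/remove idiom implied by the note above.
    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5, 6};

      // remove_if shifts kept elements to the front and returns the new logical
      // end; the container's size is unchanged until erase() trims the tail.
      auto new_end = std::remove_if(v.begin(), v.end(),
                                    [](int x) { return x % 2 == 0; });
      v.erase(new_end, v.end());

      for (int x : v)
        std::cout << x << ' ';   // prints: 1 3 5
      std::cout << '\n';
    }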
939
940 template<typename _ForwardIterator, typename _BinaryPredicate>
941 _ForwardIterator
942 __adjacent_find(_ForwardIterator __first, _ForwardIterator __last,
943 _BinaryPredicate __binary_pred)
944 {
945 if (__first == __last)
946 return __last;
947 _ForwardIterator __next = __first;
948 while (++__next != __last)
949 {
950 if (__binary_pred(__first, __next))
951 return __first;
952 __first = __next;
953 }
954 return __last;
955 }
956
957 template<typename _ForwardIterator, typename _BinaryPredicate>
958 _ForwardIterator
959 __unique(_ForwardIterator __first, _ForwardIterator __last,
960 _BinaryPredicate __binary_pred)
961 {
962 // Skip the beginning, if already unique.
963 __first = std::__adjacent_find(__first, __last, __binary_pred);
964 if (__first == __last)
965 return __last;
966
967 // Do the real copy work.
968 _ForwardIterator __dest = __first;
969 ++__first;
970 while (++__first != __last)
971 if (!__binary_pred(__dest, __first))
972	  *++__dest = _GLIBCXX_MOVE(*__first);
973 return ++__dest;
974 }
975
976 /**
977 * @brief Remove consecutive duplicate values from a sequence.
978 * @ingroup mutating_algorithms
979 * @param __first A forward iterator.
980 * @param __last A forward iterator.
981 * @return An iterator designating the end of the resulting sequence.
982 *
983 * Removes all but the first element from each group of consecutive
984 * values that compare equal.
985 * unique() is stable, so the relative order of elements that are
986 * not removed is unchanged.
987 * Elements between the end of the resulting sequence and @p __last
988 * are still present, but their value is unspecified.
989 */
990 template<typename _ForwardIterator>
991 inline _ForwardIterator
992 unique(_ForwardIterator __first, _ForwardIterator __last)
993 {
994 // concept requirements
995 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
996 _ForwardIterator>)
997 __glibcxx_function_requires(_EqualityComparableConcept<
998 typename iterator_traits<_ForwardIterator>::value_type>)
999 __glibcxx_requires_valid_range(__first, __last);
1000
1001 return std::__unique(__first, __last,
1002 __gnu_cxx::__ops::__iter_equal_to_iter());
1003 }
1004
1005 /**
1006 * @brief Remove consecutive values from a sequence using a predicate.
1007 * @ingroup mutating_algorithms
1008 * @param __first A forward iterator.
1009 * @param __last A forward iterator.
1010 * @param __binary_pred A binary predicate.
1011 * @return An iterator designating the end of the resulting sequence.
1012 *
1013 * Removes all but the first element from each group of consecutive
1014 * values for which @p __binary_pred returns true.
1015 * unique() is stable, so the relative order of elements that are
1016 * not removed is unchanged.
1017 * Elements between the end of the resulting sequence and @p __last
1018 * are still present, but their value is unspecified.
1019 */
1020 template<typename _ForwardIterator, typename _BinaryPredicate>
1021 inline _ForwardIterator
1022 unique(_ForwardIterator __first, _ForwardIterator __last,
1023 _BinaryPredicate __binary_pred)
1024 {
1025 // concept requirements
1026 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1027 _ForwardIterator>)
1028 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1029 typename iterator_traits<_ForwardIterator>::value_type,
1030 typename iterator_traits<_ForwardIterator>::value_type>)
1031 __glibcxx_requires_valid_range(__first, __last);
1032
1033 return std::__unique(__first, __last,
1034 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred));
1035 }
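
unique, like remove_if, only moves the survivors forward and returns the new logical end, and it collapses only consecutive duplicates. A minimal sketch (not part of the analyzed source):

    // unique_demo.cpp
    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 1, 2, 2, 2, 3, 1};

      // Only *consecutive* equal values collapse; the trailing 1 survives.
      v.erase(std::unique(v.begin(), v.end()), v.end());

      for (int x : v)
        std::cout << x << ' ';   // prints: 1 2 3 1
      std::cout << '\n';
    }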
1036
1037 /**
1038 * This is an uglified
1039 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1040 * _BinaryPredicate)
1041 * overloaded for forward iterators and output iterator as result.
1042 */
1043 template<typename _ForwardIterator, typename _OutputIterator,
1044 typename _BinaryPredicate>
1045 _OutputIterator
1046 __unique_copy(_ForwardIterator __first, _ForwardIterator __last,
1047 _OutputIterator __result, _BinaryPredicate __binary_pred,
1048 forward_iterator_tag, output_iterator_tag)
1049 {
1050 // concept requirements -- iterators already checked
1051 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1052 typename iterator_traits<_ForwardIterator>::value_type,
1053 typename iterator_traits<_ForwardIterator>::value_type>)
1054
1055 _ForwardIterator __next = __first;
1056 *__result = *__first;
1057 while (++__next != __last)
1058 if (!__binary_pred(__first, __next))
1059 {
1060 __first = __next;
1061 *++__result = *__first;
1062 }
1063 return ++__result;
1064 }
1065
1066 /**
1067 * This is an uglified
1068 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1069 * _BinaryPredicate)
1070 * overloaded for input iterators and output iterator as result.
1071 */
1072 template<typename _InputIterator, typename _OutputIterator,
1073 typename _BinaryPredicate>
1074 _OutputIterator
1075 __unique_copy(_InputIterator __first, _InputIterator __last,
1076 _OutputIterator __result, _BinaryPredicate __binary_pred,
1077 input_iterator_tag, output_iterator_tag)
1078 {
1079 // concept requirements -- iterators already checked
1080 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1081 typename iterator_traits<_InputIterator>::value_type,
1082 typename iterator_traits<_InputIterator>::value_type>)
1083
1084 typename iterator_traits<_InputIterator>::value_type __value = *__first;
1085 __decltype(__gnu_cxx::__ops::__iter_comp_val(__binary_pred))
1086 __rebound_pred
1087 = __gnu_cxx::__ops::__iter_comp_val(__binary_pred);
1088 *__result = __value;
1089 while (++__first != __last)
1090 if (!__rebound_pred(__first, __value))
1091 {
1092 __value = *__first;
1093 *++__result = __value;
1094 }
1095 return ++__result;
1096 }
1097
1098 /**
1099 * This is an uglified
1100 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1101 * _BinaryPredicate)
1102 * overloaded for input iterators and forward iterator as result.
1103 */
1104 template<typename _InputIterator, typename _ForwardIterator,
1105 typename _BinaryPredicate>
1106 _ForwardIterator
1107 __unique_copy(_InputIterator __first, _InputIterator __last,
1108 _ForwardIterator __result, _BinaryPredicate __binary_pred,
1109 input_iterator_tag, forward_iterator_tag)
1110 {
1111 // concept requirements -- iterators already checked
1112 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1113 typename iterator_traits<_ForwardIterator>::value_type,
1114 typename iterator_traits<_InputIterator>::value_type>)
1115 *__result = *__first;
1116 while (++__first != __last)
1117 if (!__binary_pred(__result, __first))
1118 *++__result = *__first;
1119 return ++__result;
1120 }
1121
1122 /**
1123 * This is an uglified reverse(_BidirectionalIterator,
1124 * _BidirectionalIterator)
1125 * overloaded for bidirectional iterators.
1126 */
1127 template<typename _BidirectionalIterator>
1128 void
1129 __reverse(_BidirectionalIterator __first, _BidirectionalIterator __last,
1130 bidirectional_iterator_tag)
1131 {
1132 while (true)
1133 if (__first == __last || __first == --__last)
1134 return;
1135 else
1136 {
1137 std::iter_swap(__first, __last);
1138 ++__first;
1139 }
1140 }
1141
1142 /**
1143 * This is an uglified reverse(_BidirectionalIterator,
1144 * _BidirectionalIterator)
1145 * overloaded for random access iterators.
1146 */
1147 template<typename _RandomAccessIterator>
1148 void
1149 __reverse(_RandomAccessIterator __first, _RandomAccessIterator __last,
1150 random_access_iterator_tag)
1151 {
1152 if (__first == __last)
1153 return;
1154 --__last;
1155 while (__first < __last)
1156 {
1157 std::iter_swap(__first, __last);
1158 ++__first;
1159 --__last;
1160 }
1161 }
1162
1163 /**
1164 * @brief Reverse a sequence.
1165 * @ingroup mutating_algorithms
1166 * @param __first A bidirectional iterator.
1167 * @param __last A bidirectional iterator.
1168 * @return reverse() returns no value.
1169 *
1170 * Reverses the order of the elements in the range @p [__first,__last),
1171 * so that the first element becomes the last etc.
1172     *  For every @c i such that @p 0<=i<=(__last-__first)/2, @p reverse()
1173 * swaps @p *(__first+i) and @p *(__last-(i+1))
1174 */
1175 template<typename _BidirectionalIterator>
1176 inline void
1177 reverse(_BidirectionalIterator __first, _BidirectionalIterator __last)
1178 {
1179 // concept requirements
1180 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
1181 _BidirectionalIterator>)
1182 __glibcxx_requires_valid_range(__first, __last);
1183 std::__reverse(__first, __last, std::__iterator_category(__first));
1184 }
1185
1186 /**
1187 * @brief Copy a sequence, reversing its elements.
1188 * @ingroup mutating_algorithms
1189 * @param __first A bidirectional iterator.
1190 * @param __last A bidirectional iterator.
1191 * @param __result An output iterator.
1192 * @return An iterator designating the end of the resulting sequence.
1193 *
1194 * Copies the elements in the range @p [__first,__last) to the
1195 * range @p [__result,__result+(__last-__first)) such that the
1196 * order of the elements is reversed. For every @c i such that @p
1197 * 0<=i<=(__last-__first), @p reverse_copy() performs the
1198 * assignment @p *(__result+(__last-__first)-1-i) = *(__first+i).
1199 * The ranges @p [__first,__last) and @p
1200 * [__result,__result+(__last-__first)) must not overlap.
1201 */
1202 template<typename _BidirectionalIterator, typename _OutputIterator>
1203 _OutputIterator
1204 reverse_copy(_BidirectionalIterator __first, _BidirectionalIterator __last,
1205 _OutputIterator __result)
1206 {
1207 // concept requirements
1208 __glibcxx_function_requires(_BidirectionalIteratorConcept<
1209 _BidirectionalIterator>)
1210 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
1211 typename iterator_traits<_BidirectionalIterator>::value_type>)
1212 __glibcxx_requires_valid_range(__first, __last);
1213
1214 while (__first != __last)
1215 {
1216 --__last;
1217 *__result = *__last;
1218 ++__result;
1219 }
1220 return __result;
1221 }
1222
1223 /**
1224 * This is a helper function for the rotate algorithm specialized on RAIs.
1225 * It returns the greatest common divisor of two integer values.
1226 */
1227 template<typename _EuclideanRingElement>
1228 _EuclideanRingElement
1229 __gcd(_EuclideanRingElement __m, _EuclideanRingElement __n)
1230 {
1231 while (__n != 0)
1232 {
1233 _EuclideanRingElement __t = __m % __n;
1234 __m = __n;
1235 __n = __t;
1236 }
1237 return __m;
1238 }
1239
1240 inline namespace _V2
1241 {
1242
1243 /// This is a helper function for the rotate algorithm.
1244 template<typename _ForwardIterator>
1245 _ForwardIterator
1246 __rotate(_ForwardIterator __first,
1247 _ForwardIterator __middle,
1248 _ForwardIterator __last,
1249 forward_iterator_tag)
1250 {
1251 if (__first == __middle)
1252 return __last;
1253 else if (__last == __middle)
1254 return __first;
1255
1256 _ForwardIterator __first2 = __middle;
1257 do
1258 {
1259 std::iter_swap(__first, __first2);
1260 ++__first;
1261 ++__first2;
1262 if (__first == __middle)
1263 __middle = __first2;
1264 }
1265 while (__first2 != __last);
1266
1267 _ForwardIterator __ret = __first;
1268
1269 __first2 = __middle;
1270
1271 while (__first2 != __last)
1272 {
1273 std::iter_swap(__first, __first2);
1274 ++__first;
1275 ++__first2;
1276 if (__first == __middle)
1277 __middle = __first2;
1278 else if (__first2 == __last)
1279 __first2 = __middle;
1280 }
1281 return __ret;
1282 }
1283
1284 /// This is a helper function for the rotate algorithm.
1285 template<typename _BidirectionalIterator>
1286 _BidirectionalIterator
1287 __rotate(_BidirectionalIterator __first,
1288 _BidirectionalIterator __middle,
1289 _BidirectionalIterator __last,
1290 bidirectional_iterator_tag)
1291 {
1292 // concept requirements
1293 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
1294 _BidirectionalIterator>)
1295
1296 if (__first == __middle)
1297 return __last;
1298 else if (__last == __middle)
1299 return __first;
1300
1301 std::__reverse(__first, __middle, bidirectional_iterator_tag());
1302 std::__reverse(__middle, __last, bidirectional_iterator_tag());
1303
1304 while (__first != __middle && __middle != __last)
1305 {
1306 std::iter_swap(__first, --__last);
1307 ++__first;
1308 }
1309
1310 if (__first == __middle)
1311 {
1312 std::__reverse(__middle, __last, bidirectional_iterator_tag());
1313 return __last;
1314 }
1315 else
1316 {
1317 std::__reverse(__first, __middle, bidirectional_iterator_tag());
1318 return __first;
1319 }
1320 }
1321
1322 /// This is a helper function for the rotate algorithm.
1323 template<typename _RandomAccessIterator>
1324 _RandomAccessIterator
1325 __rotate(_RandomAccessIterator __first,
1326 _RandomAccessIterator __middle,
1327 _RandomAccessIterator __last,
1328 random_access_iterator_tag)
1329 {
1330 // concept requirements
1331 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
1332 _RandomAccessIterator>)
1333
1334 if (__first == __middle)
1335 return __last;
1336 else if (__last == __middle)
1337 return __first;
1338
1339 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
1340 _Distance;
1341 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1342 _ValueType;
1343
1344 _Distance __n = __last - __first;
1345 _Distance __k = __middle - __first;
1346
1347 if (__k == __n - __k)
1348 {
1349 std::swap_ranges(__first, __middle, __middle);
1350 return __middle;
1351 }
1352
1353 _RandomAccessIterator __p = __first;
1354 _RandomAccessIterator __ret = __first + (__last - __middle);
1355
1356 for (;;)
1357 {
1358 if (__k < __n - __k)
1359 {
1360 if (__is_pod(_ValueType) && __k == 1)
1361 {
1362		  _ValueType __t = _GLIBCXX_MOVE(*__p);
1363		  _GLIBCXX_MOVE3(__p + 1, __p + __n, __p);
1364		  *(__p + __n - 1) = _GLIBCXX_MOVE(__t);
1365 return __ret;
1366 }
1367 _RandomAccessIterator __q = __p + __k;
1368 for (_Distance __i = 0; __i < __n - __k; ++ __i)
1369 {
1370 std::iter_swap(__p, __q);
1371 ++__p;
1372 ++__q;
1373 }
1374 __n %= __k;
1375 if (__n == 0)
1376 return __ret;
1377 std::swap(__n, __k);
1378 __k = __n - __k;
1379 }
1380 else
1381 {
1382 __k = __n - __k;
1383 if (__is_pod(_ValueType) && __k == 1)
1384 {
1385		  _ValueType __t = _GLIBCXX_MOVE(*(__p + __n - 1));
1386		  _GLIBCXX_MOVE_BACKWARD3(__p, __p + __n - 1, __p + __n);
1387		  *__p = _GLIBCXX_MOVE(__t);
1388 return __ret;
1389 }
1390 _RandomAccessIterator __q = __p + __n;
1391 __p = __q - __k;
1392 for (_Distance __i = 0; __i < __n - __k; ++ __i)
1393 {
1394 --__p;
1395 --__q;
1396 std::iter_swap(__p, __q);
1397 }
1398 __n %= __k;
1399 if (__n == 0)
1400 return __ret;
1401 std::swap(__n, __k);
1402 }
1403 }
1404 }
1405
1406 // _GLIBCXX_RESOLVE_LIB_DEFECTS
1407 // DR 488. rotate throws away useful information
1408 /**
1409 * @brief Rotate the elements of a sequence.
1410 * @ingroup mutating_algorithms
1411 * @param __first A forward iterator.
1412 * @param __middle A forward iterator.
1413 * @param __last A forward iterator.
1414 * @return first + (last - middle).
1415 *
1416 * Rotates the elements of the range @p [__first,__last) by
1417 * @p (__middle - __first) positions so that the element at @p __middle
1418 * is moved to @p __first, the element at @p __middle+1 is moved to
1419 * @p __first+1 and so on for each element in the range
1420 * @p [__first,__last).
1421 *
1422 * This effectively swaps the ranges @p [__first,__middle) and
1423 * @p [__middle,__last).
1424 *
1425 * Performs
1426 * @p *(__first+(n+(__last-__middle))%(__last-__first))=*(__first+n)
1427 * for each @p n in the range @p [0,__last-__first).
1428 */
1429 template<typename _ForwardIterator>
1430 inline _ForwardIterator
1431 rotate(_ForwardIterator __first, _ForwardIterator __middle,
1432 _ForwardIterator __last)
1433 {
1434 // concept requirements
1435 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1436 _ForwardIterator>)
1437 __glibcxx_requires_valid_range(__first, __middle);
1438 __glibcxx_requires_valid_range(__middle, __last);
1439
1440 return std::__rotate(__first, __middle, __last,
1441 std::__iterator_category(__first));
1442 }
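
Per DR 488 referenced above, rotate returns first + (last - middle), i.e. the new position of the element that was originally first. A minimal sketch (not part of the analyzed source):

    // rotate_demo.cpp
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5};

      // Bring the element at index 2 to the front.
      auto pivot = std::rotate(v.begin(), v.begin() + 2, v.end());

      for (int x : v)
        std::cout << x << ' ';                 // prints: 3 4 5 1 2
      std::cout << "| old front now at index "
                << std::distance(v.begin(), pivot) << '\n';   // index 3
    }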
1443
1444 } // namespace _V2
1445
1446 /**
1447 * @brief Copy a sequence, rotating its elements.
1448 * @ingroup mutating_algorithms
1449 * @param __first A forward iterator.
1450 * @param __middle A forward iterator.
1451 * @param __last A forward iterator.
1452 * @param __result An output iterator.
1453 * @return An iterator designating the end of the resulting sequence.
1454 *
1455 * Copies the elements of the range @p [__first,__last) to the
1456     *  range beginning at @p __result, rotating the copied elements by
1457 * @p (__middle-__first) positions so that the element at @p __middle
1458 * is moved to @p __result, the element at @p __middle+1 is moved
1459 * to @p __result+1 and so on for each element in the range @p
1460 * [__first,__last).
1461 *
1462 * Performs
1463 * @p *(__result+(n+(__last-__middle))%(__last-__first))=*(__first+n)
1464 * for each @p n in the range @p [0,__last-__first).
1465 */
1466 template<typename _ForwardIterator, typename _OutputIterator>
1467 inline _OutputIterator
1468 rotate_copy(_ForwardIterator __first, _ForwardIterator __middle,
1469 _ForwardIterator __last, _OutputIterator __result)
1470 {
1471 // concept requirements
1472 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
1473 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
1474 typename iterator_traits<_ForwardIterator>::value_type>)
1475 __glibcxx_requires_valid_range(__first, __middle);
1476 __glibcxx_requires_valid_range(__middle, __last);
1477
1478 return std::copy(__first, __middle,
1479 std::copy(__middle, __last, __result));
1480 }
1481
1482 /// This is a helper function...
1483 template<typename _ForwardIterator, typename _Predicate>
1484 _ForwardIterator
1485 __partition(_ForwardIterator __first, _ForwardIterator __last,
1486 _Predicate __pred, forward_iterator_tag)
1487 {
1488 if (__first == __last)
1489 return __first;
1490
1491 while (__pred(*__first))
1492 if (++__first == __last)
1493 return __first;
1494
1495 _ForwardIterator __next = __first;
1496
1497 while (++__next != __last)
1498 if (__pred(*__next))
1499 {
1500 std::iter_swap(__first, __next);
1501 ++__first;
1502 }
1503
1504 return __first;
1505 }
1506
1507 /// This is a helper function...
1508 template<typename _BidirectionalIterator, typename _Predicate>
1509 _BidirectionalIterator
1510 __partition(_BidirectionalIterator __first, _BidirectionalIterator __last,
1511 _Predicate __pred, bidirectional_iterator_tag)
1512 {
1513 while (true)
1514 {
1515 while (true)
1516 if (__first == __last)
1517 return __first;
1518 else if (__pred(*__first))
1519 ++__first;
1520 else
1521 break;
1522 --__last;
1523 while (true)
1524 if (__first == __last)
1525 return __first;
1526 else if (!bool(__pred(*__last)))
1527 --__last;
1528 else
1529 break;
1530 std::iter_swap(__first, __last);
1531 ++__first;
1532 }
1533 }
1534
1535 // partition
1536
1537 /// This is a helper function...
1538 /// Requires __first != __last and !__pred(__first)
1539 /// and __len == distance(__first, __last).
1540 ///
1541 /// !__pred(__first) allows us to guarantee that we don't
1542 /// move-assign an element onto itself.
1543 template<typename _ForwardIterator, typename _Pointer, typename _Predicate,
1544 typename _Distance>
1545 _ForwardIterator
1546 __stable_partition_adaptive(_ForwardIterator __first,
1547 _ForwardIterator __last,
1548 _Predicate __pred, _Distance __len,
1549 _Pointer __buffer,
1550 _Distance __buffer_size)
1551 {
1552 if (__len == 1)
1553 return __first;
1554
1555 if (__len <= __buffer_size)
1556 {
1557 _ForwardIterator __result1 = __first;
1558 _Pointer __result2 = __buffer;
1559
1560 // The precondition guarantees that !__pred(__first), so
1561 // move that element to the buffer before starting the loop.
1562 // This ensures that we only call __pred once per element.
1563	  *__result2 = _GLIBCXX_MOVE(*__first);
1564 ++__result2;
1565 ++__first;
1566 for (; __first != __last; ++__first)
1567 if (__pred(__first))
1568 {
1569		*__result1 = _GLIBCXX_MOVE(*__first);
1570 ++__result1;
1571 }
1572 else
1573 {
1574		*__result2 = _GLIBCXX_MOVE(*__first);
1575 ++__result2;
1576 }
1577
1578	  _GLIBCXX_MOVE3(__buffer, __result2, __result1);
1579 return __result1;
1580 }
1581
1582 _ForwardIterator __middle = __first;
1583 std::advance(__middle, __len / 2);
1584 _ForwardIterator __left_split =
1585 std::__stable_partition_adaptive(__first, __middle, __pred,
1586 __len / 2, __buffer,
1587 __buffer_size);
1588
1589 // Advance past true-predicate values to satisfy this
1590 // function's preconditions.
1591 _Distance __right_len = __len - __len / 2;
1592 _ForwardIterator __right_split =
1593 std::__find_if_not_n(__middle, __right_len, __pred);
1594
1595 if (__right_len)
1596 __right_split =
1597 std::__stable_partition_adaptive(__right_split, __last, __pred,
1598 __right_len,
1599 __buffer, __buffer_size);
1600
1601 std::rotate(__left_split, __middle, __right_split);
1602 std::advance(__left_split, std::distance(__middle, __right_split));
1603 return __left_split;
1604 }
1605
1606 template<typename _ForwardIterator, typename _Predicate>
1607 _ForwardIterator
1608 __stable_partition(_ForwardIterator __first, _ForwardIterator __last,
1609 _Predicate __pred)
1610 {
1611 __first = std::__find_if_not(__first, __last, __pred);
1612
1613 if (__first == __last)
1614 return __first;
1615
1616 typedef typename iterator_traits<_ForwardIterator>::value_type
1617 _ValueType;
1618 typedef typename iterator_traits<_ForwardIterator>::difference_type
1619 _DistanceType;
1620
1621 _Temporary_buffer<_ForwardIterator, _ValueType> __buf(__first, __last);
1622 return
1623 std::__stable_partition_adaptive(__first, __last, __pred,
1624 _DistanceType(__buf.requested_size()),
1625 __buf.begin(),
1626 _DistanceType(__buf.size()));
1627 }
1628
1629 /**
1630 * @brief Move elements for which a predicate is true to the beginning
1631 * of a sequence, preserving relative ordering.
1632 * @ingroup mutating_algorithms
1633 * @param __first A forward iterator.
1634 * @param __last A forward iterator.
1635 * @param __pred A predicate functor.
1636 * @return An iterator @p middle such that @p __pred(i) is true for each
1637 * iterator @p i in the range @p [first,middle) and false for each @p i
1638 * in the range @p [middle,last).
1639 *
1640 * Performs the same function as @p partition() with the additional
1641 * guarantee that the relative ordering of elements in each group is
1642 * preserved, so any two elements @p x and @p y in the range
1643 * @p [__first,__last) such that @p __pred(x)==__pred(y) will have the same
1644 * relative ordering after calling @p stable_partition().
1645 */
1646 template<typename _ForwardIterator, typename _Predicate>
1647 inline _ForwardIterator
1648 stable_partition(_ForwardIterator __first, _ForwardIterator __last,
1649 _Predicate __pred)
1650 {
1651 // concept requirements
1652 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1653 _ForwardIterator>)
1654 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
1655 typename iterator_traits<_ForwardIterator>::value_type>)
1656 __glibcxx_requires_valid_range(__first, __last);
1657
1658 return std::__stable_partition(__first, __last,
1659 __gnu_cxx::__ops::__pred_iter(__pred));
1660 }
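
stable_partition gives the same split as partition but, as documented, preserves the relative order inside each group (buying that guarantee with a temporary buffer or a rotate-based divide and conquer, as the helpers above show). A minimal sketch (not part of the analyzed source):

    // stable_partition_demo.cpp
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5, 6};

      // Even values move to the front; both groups keep their original order.
      auto mid = std::stable_partition(v.begin(), v.end(),
                                       [](int x) { return x % 2 == 0; });

      for (int x : v)
        std::cout << x << ' ';   // prints: 2 4 6 1 3 5
      std::cout << "| boundary at index "
                << std::distance(v.begin(), mid) << '\n';   // index 3
    }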
1661
1662 /// This is a helper function for the sort routines.
1663 template<typename _RandomAccessIterator, typename _Compare>
1664 void
1665 __heap_select(_RandomAccessIterator __first,
1666 _RandomAccessIterator __middle,
1667 _RandomAccessIterator __last, _Compare __comp)
1668 {
1669 std::__make_heap(__first, __middle, __comp);
1670 for (_RandomAccessIterator __i = __middle; __i < __last; ++__i)
1671 if (__comp(__i, __first))
1672 std::__pop_heap(__first, __middle, __i, __comp);
1673 }
1674
1675 // partial_sort
1676
1677 template<typename _InputIterator, typename _RandomAccessIterator,
1678 typename _Compare>
1679 _RandomAccessIterator
1680 __partial_sort_copy(_InputIterator __first, _InputIterator __last,
1681 _RandomAccessIterator __result_first,
1682 _RandomAccessIterator __result_last,
1683 _Compare __comp)
1684 {
1685 typedef typename iterator_traits<_InputIterator>::value_type
1686 _InputValueType;
1687 typedef iterator_traits<_RandomAccessIterator> _RItTraits;
1688 typedef typename _RItTraits::difference_type _DistanceType;
1689
1690 if (__result_first == __result_last)
1691 return __result_last;
1692 _RandomAccessIterator __result_real_last = __result_first;
1693 while (__first != __last && __result_real_last != __result_last)
1694 {
1695 *__result_real_last = *__first;
1696 ++__result_real_last;
1697 ++__first;
1698 }
1699
1700 std::__make_heap(__result_first, __result_real_last, __comp);
1701 while (__first != __last)
1702 {
1703 if (__comp(__first, __result_first))
1704 std::__adjust_heap(__result_first, _DistanceType(0),
1705 _DistanceType(__result_real_last
1706 - __result_first),
1707 _InputValueType(*__first), __comp);
1708 ++__first;
1709 }
1710 std::__sort_heap(__result_first, __result_real_last, __comp);
1711 return __result_real_last;
1712 }
1713
1714 /**
1715 * @brief Copy the smallest elements of a sequence.
1716 * @ingroup sorting_algorithms
1717 * @param __first An iterator.
1718 * @param __last Another iterator.
1719 * @param __result_first A random-access iterator.
1720 * @param __result_last Another random-access iterator.
1721 * @return An iterator indicating the end of the resulting sequence.
1722 *
1723 * Copies and sorts the smallest N values from the range @p [__first,__last)
1724 * to the range beginning at @p __result_first, where the number of
1725 * elements to be copied, @p N, is the smaller of @p (__last-__first) and
1726 * @p (__result_last-__result_first).
1727 * After the sort if @e i and @e j are iterators in the range
1728 * @p [__result_first,__result_first+N) such that i precedes j then
1729 * *j<*i is false.
1730 * The value returned is @p __result_first+N.
1731 */
1732 template<typename _InputIterator, typename _RandomAccessIterator>
1733 inline _RandomAccessIterator
1734 partial_sort_copy(_InputIterator __first, _InputIterator __last,
1735 _RandomAccessIterator __result_first,
1736 _RandomAccessIterator __result_last)
1737 {
1738#ifdef _GLIBCXX_CONCEPT_CHECKS
1739 typedef typename iterator_traits<_InputIterator>::value_type
1740 _InputValueType;
1741 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1742 _OutputValueType;
1743#endif
1744
1745 // concept requirements
1746 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
1747 __glibcxx_function_requires(_ConvertibleConcept<_InputValueType,
1748 _OutputValueType>)
1749 __glibcxx_function_requires(_LessThanOpConcept<_InputValueType,
1750 _OutputValueType>)
1751 __glibcxx_function_requires(_LessThanComparableConcept<_OutputValueType>)
1752 __glibcxx_requires_valid_range(__first, __last);
1753 __glibcxx_requires_irreflexive(__first, __last);
1754 __glibcxx_requires_valid_range(__result_first, __result_last);
1755
1756 return std::__partial_sort_copy(__first, __last,
1757 __result_first, __result_last,
1758 __gnu_cxx::__ops::__iter_less_iter());
1759 }
1760
1761 /**
1762 * @brief Copy the smallest elements of a sequence using a predicate for
1763 * comparison.
1764 * @ingroup sorting_algorithms
1765 * @param __first An input iterator.
1766 * @param __last Another input iterator.
1767 * @param __result_first A random-access iterator.
1768 * @param __result_last Another random-access iterator.
1769 * @param __comp A comparison functor.
1770 * @return An iterator indicating the end of the resulting sequence.
1771 *
1772 * Copies and sorts the smallest N values from the range @p [__first,__last)
1773 * to the range beginning at @p result_first, where the number of
1774 * elements to be copied, @p N, is the smaller of @p (__last-__first) and
1775 * @p (__result_last-__result_first).
1776 * After the sort if @e i and @e j are iterators in the range
1777 * @p [__result_first,__result_first+N) such that i precedes j then
1778 * @p __comp(*j,*i) is false.
1779 * The value returned is @p __result_first+N.
1780 */
1781 template<typename _InputIterator, typename _RandomAccessIterator,
1782 typename _Compare>
1783 inline _RandomAccessIterator
1784 partial_sort_copy(_InputIterator __first, _InputIterator __last,
1785 _RandomAccessIterator __result_first,
1786 _RandomAccessIterator __result_last,
1787 _Compare __comp)
1788 {
1789#ifdef _GLIBCXX_CONCEPT_CHECKS
1790 typedef typename iterator_traits<_InputIterator>::value_type
1791 _InputValueType;
1792 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1793 _OutputValueType;
1794#endif
1795
1796 // concept requirements
1797 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
1798 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
1799 _RandomAccessIterator>)
1800 __glibcxx_function_requires(_ConvertibleConcept<_InputValueType,
1801 _OutputValueType>)
1802 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
1803 _InputValueType, _OutputValueType>)
1804 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
1805 _OutputValueType, _OutputValueType>)
1806 __glibcxx_requires_valid_range(__first, __last);
1807 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
1808 __glibcxx_requires_valid_range(__result_first, __result_last);
1809
1810 return std::__partial_sort_copy(__first, __last,
1811 __result_first, __result_last,
1812 __gnu_cxx::__ops::__iter_comp_iter(__comp));
1813 }
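
partial_sort_copy fills the destination, turns it into a heap, and then lets better candidates displace the current worst, so only the N best elements under the comparison are ever kept. A minimal sketch using std::greater (not part of the analyzed source):

    // partial_sort_copy_demo.cpp
    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
      std::vector<int> src{5, 9, 1, 7, 3};
      std::vector<int> dst(3);   // N = min(source length, destination length) = 3

      // With std::greater, the three largest values land in dst, in descending order.
      auto end = std::partial_sort_copy(src.begin(), src.end(),
                                        dst.begin(), dst.end(),
                                        std::greater<int>());
      for (auto it = dst.begin(); it != end; ++it)
        std::cout << *it << ' ';   // prints: 9 7 5
      std::cout << '\n';
    }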
1814
1815 /// This is a helper function for the sort routine.
1816 template<typename _RandomAccessIterator, typename _Compare>
1817 void
1818 __unguarded_linear_insert(_RandomAccessIterator __last,
1819 _Compare __comp)
1820 {
1821 typename iterator_traits<_RandomAccessIterator>::value_type
1822	__val = _GLIBCXX_MOVE(*__last);
1823 _RandomAccessIterator __next = __last;
1824 --__next;
1825 while (__comp(__val, __next))
1826 {
1827	  *__last = _GLIBCXX_MOVE(*__next);
1828 __last = __next;
1829 --__next;
1830 }
1831      *__last = _GLIBCXX_MOVE(__val);
1832 }
1833
1834 /// This is a helper function for the sort routine.
1835 template<typename _RandomAccessIterator, typename _Compare>
1836 void
1837 __insertion_sort(_RandomAccessIterator __first,
1838 _RandomAccessIterator __last, _Compare __comp)
1839 {
1840 if (__first == __last) return;
1841
1842 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
1843 {
1844 if (__comp(__i, __first))
1845 {
1846 typename iterator_traits<_RandomAccessIterator>::value_type
1847	      __val = _GLIBCXX_MOVE(*__i);
1848	      _GLIBCXX_MOVE_BACKWARD3(__first, __i, __i + 1);
1849	      *__first = _GLIBCXX_MOVE(__val);
1850 }
1851 else
1852 std::__unguarded_linear_insert(__i,
1853 __gnu_cxx::__ops::__val_comp_iter(__comp));
1854 }
1855 }
1856
1857 /// This is a helper function for the sort routine.
1858 template<typename _RandomAccessIterator, typename _Compare>
1859 inline void
1860 __unguarded_insertion_sort(_RandomAccessIterator __first,
1861 _RandomAccessIterator __last, _Compare __comp)
1862 {
1863 for (_RandomAccessIterator __i = __first; __i != __last; ++__i)
1864 std::__unguarded_linear_insert(__i,
1865 __gnu_cxx::__ops::__val_comp_iter(__comp));
1866 }
1867
1868 /**
1869 * @doctodo
1870 * This controls some aspect of the sort routines.
1871 */
1872 enum { _S_threshold = 16 };
1873
1874 /// This is a helper function for the sort routine.
1875 template<typename _RandomAccessIterator, typename _Compare>
1876 void
1877 __final_insertion_sort(_RandomAccessIterator __first,
1878 _RandomAccessIterator __last, _Compare __comp)
1879 {
1880 if (__last - __first > int(_S_threshold))
1881 {
1882 std::__insertion_sort(__first, __first + int(_S_threshold), __comp);
1883 std::__unguarded_insertion_sort(__first + int(_S_threshold), __last,
1884 __comp);
1885 }
1886 else
1887 std::__insertion_sort(__first, __last, __comp);
1888 }
1889
1890 /// This is a helper function...
1891 template<typename _RandomAccessIterator, typename _Compare>
1892 _RandomAccessIterator
1893 __unguarded_partition(_RandomAccessIterator __first,
1894 _RandomAccessIterator __last,
1895 _RandomAccessIterator __pivot, _Compare __comp)
1896 {
1897 while (true)
1898 {
1899 while (__comp(__first, __pivot))
1900 ++__first;
1901 --__last;
1902 while (__comp(__pivot, __last))
1903 --__last;
1904 if (!(__first < __last))
1905 return __first;
1906 std::iter_swap(__first, __last);
1907 ++__first;
1908 }
1909 }
1910
1911 /// This is a helper function...
1912 template<typename _RandomAccessIterator, typename _Compare>
1913 inline _RandomAccessIterator
1914 __unguarded_partition_pivot(_RandomAccessIterator __first,
1915 _RandomAccessIterator __last, _Compare __comp)
1916 {
1917 _RandomAccessIterator __mid = __first + (__last - __first) / 2;
1918 std::__move_median_to_first(__first, __first + 1, __mid, __last - 1,
1919 __comp);
1920 return std::__unguarded_partition(__first + 1, __last, __first, __comp);
1921 }
1922
1923 template<typename _RandomAccessIterator, typename _Compare>
1924 inline void
1925 __partial_sort(_RandomAccessIterator __first,
1926 _RandomAccessIterator __middle,
1927 _RandomAccessIterator __last,
1928 _Compare __comp)
1929 {
1930 std::__heap_select(__first, __middle, __last, __comp);
1931 std::__sort_heap(__first, __middle, __comp);
1932 }
1933
1934 /// This is a helper function for the sort routine.
1935 template<typename _RandomAccessIterator, typename _Size, typename _Compare>
1936 void
1937 __introsort_loop(_RandomAccessIterator __first,
1938 _RandomAccessIterator __last,
1939 _Size __depth_limit, _Compare __comp)
1940 {
1941 while (__last - __first > int(_S_threshold))
1942 {
1943 if (__depth_limit == 0)
1944 {
1945 std::__partial_sort(__first, __last, __last, __comp);
1946 return;
1947 }
1948 --__depth_limit;
1949 _RandomAccessIterator __cut =
1950 std::__unguarded_partition_pivot(__first, __last, __comp);
1951 std::__introsort_loop(__cut, __last, __depth_limit, __comp);
1952 __last = __cut;
1953 }
1954 }
1955
1956 // sort
1957
1958 template<typename _RandomAccessIterator, typename _Compare>
1959 inline void
1960 __sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
1961 _Compare __comp)
1962 {
1963 if (__first != __last)
1964 {
1965 std::__introsort_loop(__first, __last,
1966 std::__lg(__last - __first) * 2,
1967 __comp);
1968 std::__final_insertion_sort(__first, __last, __comp);
1969 }
1970 }
1971
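Taken together, __introsort_loop and __final_insertion_sort implement the introsort strategy behind std::sort: partition until the depth limit of roughly 2*log2(N) is reached, fall back to heap sort (__partial_sort) for pathological inputs, and finish sub-ranges of at most _S_threshold elements with insertion sort. A minimal caller-side sketch, not part of the library source:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{5, 1, 4, 2, 3};
      // std::sort dispatches to __sort above with __iter_less_iter().
      std::sort(v.begin(), v.end());
      return v == std::vector<int>{1, 2, 3, 4, 5} ? 0 : 1;
    }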
1972 template<typename _RandomAccessIterator, typename _Size, typename _Compare>
1973 void
1974 __introselect(_RandomAccessIterator __first, _RandomAccessIterator __nth,
1975 _RandomAccessIterator __last, _Size __depth_limit,
1976 _Compare __comp)
1977 {
1978 while (__last - __first > 3)
1979 {
1980 if (__depth_limit == 0)
1981 {
1982 std::__heap_select(__first, __nth + 1, __last, __comp);
1983 // Place the nth largest element in its final position.
1984 std::iter_swap(__first, __nth);
1985 return;
1986 }
1987 --__depth_limit;
1988 _RandomAccessIterator __cut =
1989 std::__unguarded_partition_pivot(__first, __last, __comp);
1990 if (__cut <= __nth)
1991 __first = __cut;
1992 else
1993 __last = __cut;
1994 }
1995 std::__insertion_sort(__first, __last, __comp);
1996 }
1997
1998 // nth_element
1999
2000 // lower_bound moved to stl_algobase.h
2001
2002 /**
2003 * @brief Finds the first position in which @p __val could be inserted
2004 * without changing the ordering.
2005 * @ingroup binary_search_algorithms
2006 * @param __first An iterator.
2007 * @param __last Another iterator.
2008 * @param __val The search term.
2009 * @param __comp A functor to use for comparisons.
2010 * @return An iterator pointing to the first element <em>not less
2011 * than</em> @p __val, or end() if every element is less
2012 * than @p __val.
2013 * @ingroup binary_search_algorithms
2014 *
2015 * The comparison function should have the same effects on ordering as
2016 * the function used for the initial sort.
2017 */
2018 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2019 inline _ForwardIterator
2020 lower_bound(_ForwardIterator __first, _ForwardIterator __last,
2021 const _Tp& __val, _Compare __comp)
2022 {
2023 // concept requirements
2024 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2025 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2026 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2027 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2028 __val, __comp);
2029
2030 return std::__lower_bound(__first, __last, __val,
2031 __gnu_cxx::__ops::__iter_comp_val(__comp));
2032 }
2033
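A short usage sketch for the comparator overload of lower_bound shown above; the descending vector and greater-than lambda are illustrative only:

    #include <algorithm>
    #include <vector>

    int main()
    {
      // The range must be partitioned with respect to the same comparator.
      std::vector<int> v{9, 7, 7, 5, 2};                       // descending
      auto it = std::lower_bound(v.begin(), v.end(), 7,
                                 [](int a, int b) { return a > b; });
      return (it - v.begin()) == 1 ? 0 : 1;                    // first 7
    }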
2034 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2035 _ForwardIterator
2036 __upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2037 const _Tp& __val, _Compare __comp)
2038 {
2039 typedef typename iterator_traits<_ForwardIterator>::difference_type
2040 _DistanceType;
2041
2042 _DistanceType __len = std::distance(__first, __last);
2043
2044 while (__len > 0)
2045 {
2046 _DistanceType __half = __len >> 1;
2047 _ForwardIterator __middle = __first;
2048 std::advance(__middle, __half);
2049 if (__comp(__val, __middle))
2050 __len = __half;
2051 else
2052 {
2053 __first = __middle;
2054 ++__first;
2055 __len = __len - __half - 1;
2056 }
2057 }
2058 return __first;
2059 }
2060
2061 /**
2062 * @brief Finds the last position in which @p __val could be inserted
2063 * without changing the ordering.
2064 * @ingroup binary_search_algorithms
2065 * @param __first An iterator.
2066 * @param __last Another iterator.
2067 * @param __val The search term.
2068 * @return An iterator pointing to the first element greater than @p __val,
2069 * or end() if no elements are greater than @p __val.
2070 * @ingroup binary_search_algorithms
2071 */
2072 template<typename _ForwardIterator, typename _Tp>
2073 inline _ForwardIterator
2074 upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2075 const _Tp& __val)
2076 {
2077 // concept requirements
2078 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2079 __glibcxx_function_requires(_LessThanOpConcept<
2080 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2081 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2082
2083 return std::__upper_bound(__first, __last, __val,
2084 __gnu_cxx::__ops::__val_less_iter());
2085 }
2086
2087 /**
2088 * @brief Finds the last position in which @p __val could be inserted
2089 * without changing the ordering.
2090 * @ingroup binary_search_algorithms
2091 * @param __first An iterator.
2092 * @param __last Another iterator.
2093 * @param __val The search term.
2094 * @param __comp A functor to use for comparisons.
2095 * @return An iterator pointing to the first element greater than @p __val,
2096 * or end() if no elements are greater than @p __val.
2097 * @ingroup binary_search_algorithms
2098 *
2099 * The comparison function should have the same effects on ordering as
2100 * the function used for the initial sort.
2101 */
2102 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2103 inline _ForwardIterator
2104 upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2105 const _Tp& __val, _Compare __comp)
2106 {
2107 // concept requirements
2108 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2109 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2110 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2111 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2112 __val, __comp);
2113
2114 return std::__upper_bound(__first, __last, __val,
2115 __gnu_cxx::__ops::__val_comp_iter(__comp));
2116 }
2117
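__upper_bound halves the range each step: it keeps the left half while __comp(__val, *__middle) holds and otherwise discards the middle element together with the left half. A small illustrative check of the public overload:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 2, 2, 3};
      auto it = std::upper_bound(v.begin(), v.end(), 2);
      // Points one past the last 2, i.e. at the 3 (index 4).
      return (it - v.begin()) == 4 ? 0 : 1;
    }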
2118 template<typename _ForwardIterator, typename _Tp,
2119 typename _CompareItTp, typename _CompareTpIt>
2120 pair<_ForwardIterator, _ForwardIterator>
2121 __equal_range(_ForwardIterator __first, _ForwardIterator __last,
2122 const _Tp& __val,
2123 _CompareItTp __comp_it_val, _CompareTpIt __comp_val_it)
2124 {
2125 typedef typename iterator_traits<_ForwardIterator>::difference_type
2126 _DistanceType;
2127
2128 _DistanceType __len = std::distance(__first, __last);
2129
2130 while (__len > 0)
2131 {
2132 _DistanceType __half = __len >> 1;
2133 _ForwardIterator __middle = __first;
2134 std::advance(__middle, __half);
2135 if (__comp_it_val(__middle, __val))
2136 {
2137 __first = __middle;
2138 ++__first;
2139 __len = __len - __half - 1;
2140 }
2141 else if (__comp_val_it(__val, __middle))
2142 __len = __half;
2143 else
2144 {
2145 _ForwardIterator __left
2146 = std::__lower_bound(__first, __middle, __val, __comp_it_val);
2147 std::advance(__first, __len);
2148 _ForwardIterator __right
2149 = std::__upper_bound(++__middle, __first, __val, __comp_val_it);
2150 return pair<_ForwardIterator, _ForwardIterator>(__left, __right);
2151 }
2152 }
2153 return pair<_ForwardIterator, _ForwardIterator>(__first, __first);
2154 }
2155
2156 /**
2157 * @brief Finds the largest subrange in which @p __val could be inserted
2158 * at any place in it without changing the ordering.
2159 * @ingroup binary_search_algorithms
2160 * @param __first An iterator.
2161 * @param __last Another iterator.
2162 * @param __val The search term.
2163 * @return A pair of iterators defining the subrange.
2164 * @ingroup binary_search_algorithms
2165 *
2166 * This is equivalent to
2167 * @code
2168 * std::make_pair(lower_bound(__first, __last, __val),
2169 * upper_bound(__first, __last, __val))
2170 * @endcode
2171 * but does not actually call those functions.
2172 */
2173 template<typename _ForwardIterator, typename _Tp>
2174 inline pair<_ForwardIterator, _ForwardIterator>
2175 equal_range(_ForwardIterator __first, _ForwardIterator __last,
2176 const _Tp& __val)
2177 {
2178 // concept requirements
2179 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2180 __glibcxx_function_requires(_LessThanOpConcept<
2181 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2182 __glibcxx_function_requires(_LessThanOpConcept<
2183 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2184 __glibcxx_requires_partitioned_lower(__first, __last, __val);
2185 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2186
2187 return std::__equal_range(__first, __last, __val,
2188 __gnu_cxx::__ops::__iter_less_val(),
2189 __gnu_cxx::__ops::__val_less_iter());
2190 }
2191
2192 /**
2193 * @brief Finds the largest subrange in which @p __val could be inserted
2194 * at any place in it without changing the ordering.
2195 * @param __first An iterator.
2196 * @param __last Another iterator.
2197 * @param __val The search term.
2198 * @param __comp A functor to use for comparisons.
2199 * @return A pair of iterators defining the subrange.
2200 * @ingroup binary_search_algorithms
2201 *
2202 * This is equivalent to
2203 * @code
2204 * std::make_pair(lower_bound(__first, __last, __val, __comp),
2205 * upper_bound(__first, __last, __val, __comp))
2206 * @endcode
2207 * but does not actually call those functions.
2208 */
2209 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2210 inline pair<_ForwardIterator, _ForwardIterator>
2211 equal_range(_ForwardIterator __first, _ForwardIterator __last,
2212 const _Tp& __val, _Compare __comp)
2213 {
2214 // concept requirements
2215 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2216 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2217 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2218 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2219 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2220 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2221 __val, __comp);
2222 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2223 __val, __comp);
2224
2225 return std::__equal_range(__first, __last, __val,
2226 __gnu_cxx::__ops::__iter_comp_val(__comp),
2227 __gnu_cxx::__ops::__val_comp_iter(__comp));
2228 }
2229
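As the comments above note, equal_range computes the same bounds as separate lower_bound and upper_bound calls but shares the initial halving. An illustrative check:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 2, 2, 3};
      auto pr = std::equal_range(v.begin(), v.end(), 2);
      // [pr.first, pr.second) is the run of 2s: indices [1, 4).
      return (pr.first - v.begin() == 1 && pr.second - v.begin() == 4) ? 0 : 1;
    }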
2230 /**
2231 * @brief Determines whether an element exists in a range.
2232 * @ingroup binary_search_algorithms
2233 * @param __first An iterator.
2234 * @param __last Another iterator.
2235 * @param __val The search term.
2236 * @return True if @p __val (or its equivalent) is in [@p
2237 * __first,@p __last ].
2238 *
2239 * Note that this does not actually return an iterator to @p __val. For
2240 * that, use std::find or a container's specialized find member functions.
2241 */
2242 template<typename _ForwardIterator, typename _Tp>
2243 bool
2244 binary_search(_ForwardIterator __first, _ForwardIterator __last,
2245 const _Tp& __val)
2246 {
2247 // concept requirements
2248 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2249 __glibcxx_function_requires(_LessThanOpConcept<
2250 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2251 __glibcxx_requires_partitioned_lower(__first, __last, __val);
2252 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2253
2254 _ForwardIterator __i
2255 = std::__lower_bound(__first, __last, __val,
2256 __gnu_cxx::__ops::__iter_less_val());
2257 return __i != __last && !(__val < *__i);
2258 }
2259
2260 /**
2261 * @brief Determines whether an element exists in a range.
2262 * @ingroup binary_search_algorithms
2263 * @param __first An iterator.
2264 * @param __last Another iterator.
2265 * @param __val The search term.
2266 * @param __comp A functor to use for comparisons.
2267 * @return True if @p __val (or its equivalent) is in @p [__first,__last].
2268 *
2269 * Note that this does not actually return an iterator to @p __val. For
2270 * that, use std::find or a container's specialized find member functions.
2271 *
2272 * The comparison function should have the same effects on ordering as
2273 * the function used for the initial sort.
2274 */
2275 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2276 bool
2277 binary_search(_ForwardIterator __first, _ForwardIterator __last,
2278 const _Tp& __val, _Compare __comp)
2279 {
2280 // concept requirements
2281 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2282 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2283 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2284 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2285 __val, __comp);
2286 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2287 __val, __comp);
2288
2289 _ForwardIterator __i
2290 = std::__lower_bound(__first, __last, __val,
2291 __gnu_cxx::__ops::__iter_comp_val(__comp));
2292 return __i != __last && !bool(__comp(__val, *__i));
2293 }
2294
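binary_search only reports presence; as the documentation above says, it never returns an iterator. A minimal sketch:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 3, 5, 7};
      bool found   = std::binary_search(v.begin(), v.end(), 5);   // true
      bool missing = std::binary_search(v.begin(), v.end(), 4);   // false
      return (found && !missing) ? 0 : 1;
    }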
2295 // merge
2296
2297 /// This is a helper function for the __merge_adaptive routines.
2298 template<typename _InputIterator1, typename _InputIterator2,
2299 typename _OutputIterator, typename _Compare>
2300 void
2301 __move_merge_adaptive(_InputIterator1 __first1, _InputIterator1 __last1,
2302 _InputIterator2 __first2, _InputIterator2 __last2,
2303 _OutputIterator __result, _Compare __comp)
2304 {
2305 while (__first1 != __last1 && __first2 != __last2)
2306 {
2307 if (__comp(__first2, __first1))
2308 {
2309 *__result = _GLIBCXX_MOVE(*__first2);
2310 ++__first2;
2311 }
2312 else
2313 {
2314 *__result = _GLIBCXX_MOVE(*__first1);
2315 ++__first1;
2316 }
2317 ++__result;
2318 }
2319 if (__first1 != __last1)
2320 _GLIBCXX_MOVE3(__first1, __last1, __result);
2321 }
2322
2323 /// This is a helper function for the __merge_adaptive routines.
2324 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
2325 typename _BidirectionalIterator3, typename _Compare>
2326 void
2327 __move_merge_adaptive_backward(_BidirectionalIterator1 __first1,
2328 _BidirectionalIterator1 __last1,
2329 _BidirectionalIterator2 __first2,
2330 _BidirectionalIterator2 __last2,
2331 _BidirectionalIterator3 __result,
2332 _Compare __comp)
2333 {
2334 if (__first1 == __last1)
2335 {
2336 _GLIBCXX_MOVE_BACKWARD3(__first2, __last2, __result);
2337 return;
2338 }
2339 else if (__first2 == __last2)
2340 return;
2341
2342 --__last1;
2343 --__last2;
2344 while (true)
2345 {
2346 if (__comp(__last2, __last1))
2347 {
2348 *--__result = _GLIBCXX_MOVE(*__last1);
2349 if (__first1 == __last1)
2350 {
2351 _GLIBCXX_MOVE_BACKWARD3(__first2, ++__last2, __result);
2352 return;
2353 }
2354 --__last1;
2355 }
2356 else
2357 {
2358 *--__result = _GLIBCXX_MOVE(*__last2);
2359 if (__first2 == __last2)
2360 return;
2361 --__last2;
2362 }
2363 }
2364 }
2365
2366 /// This is a helper function for the merge routines.
2367 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
2368 typename _Distance>
2369 _BidirectionalIterator1
2370 __rotate_adaptive(_BidirectionalIterator1 __first,
2371 _BidirectionalIterator1 __middle,
2372 _BidirectionalIterator1 __last,
2373 _Distance __len1, _Distance __len2,
2374 _BidirectionalIterator2 __buffer,
2375 _Distance __buffer_size)
2376 {
2377 _BidirectionalIterator2 __buffer_end;
2378 if (__len1 > __len2 && __len2 <= __buffer_size)
2379 {
2380 if (__len2)
2381 {
2382 __buffer_end = _GLIBCXX_MOVE3(__middle, __last, __buffer);
2383 _GLIBCXX_MOVE_BACKWARD3(__first, __middle, __last);
2384 return _GLIBCXX_MOVE3(__buffer, __buffer_end, __first);
2385 }
2386 else
2387 return __first;
2388 }
2389 else if (__len1 <= __buffer_size)
2390 {
2391 if (__len1)
2392 {
2393 __buffer_end = _GLIBCXX_MOVE3(__first, __middle, __buffer);
2394 _GLIBCXX_MOVE3(__middle, __last, __first);
2395 return _GLIBCXX_MOVE_BACKWARD3(__buffer, __buffer_end, __last);
2396 }
2397 else
2398 return __last;
2399 }
2400 else
2401 {
2402 std::rotate(__first, __middle, __last);
2403 std::advance(__first, std::distance(__middle, __last));
2404 return __first;
2405 }
2406 }
2407
2408 /// This is a helper function for the merge routines.
2409 template<typename _BidirectionalIterator, typename _Distance,
2410 typename _Pointer, typename _Compare>
2411 void
2412 __merge_adaptive(_BidirectionalIterator __first,
2413 _BidirectionalIterator __middle,
2414 _BidirectionalIterator __last,
2415 _Distance __len1, _Distance __len2,
2416 _Pointer __buffer, _Distance __buffer_size,
2417 _Compare __comp)
2418 {
2419 if (__len1 <= __len2 && __len1 <= __buffer_size)
2420 {
2421 _Pointer __buffer_end = _GLIBCXX_MOVE3(__first, __middle, __buffer);
2422 std::__move_merge_adaptive(__buffer, __buffer_end, __middle, __last,
2423 __first, __comp);
2424 }
2425 else if (__len2 <= __buffer_size)
2426 {
2427 _Pointer __buffer_end = _GLIBCXX_MOVE3(__middle, __last, __buffer);
2428 std::__move_merge_adaptive_backward(__first, __middle, __buffer,
2429 __buffer_end, __last, __comp);
2430 }
2431 else
2432 {
2433 _BidirectionalIterator __first_cut = __first;
2434 _BidirectionalIterator __second_cut = __middle;
2435 _Distance __len11 = 0;
2436 _Distance __len22 = 0;
2437 if (__len1 > __len2)
2438 {
2439 __len11 = __len1 / 2;
2440 std::advance(__first_cut, __len11);
2441 __second_cut
2442 = std::__lower_bound(__middle, __last, *__first_cut,
2443 __gnu_cxx::__ops::__iter_comp_val(__comp));
2444 __len22 = std::distance(__middle, __second_cut);
2445 }
2446 else
2447 {
2448 __len22 = __len2 / 2;
2449 std::advance(__second_cut, __len22);
2450 __first_cut
2451 = std::__upper_bound(__first, __middle, *__second_cut,
2452 __gnu_cxx::__ops::__val_comp_iter(__comp));
2453 __len11 = std::distance(__first, __first_cut);
2454 }
2455
2456 _BidirectionalIterator __new_middle
2457 = std::__rotate_adaptive(__first_cut, __middle, __second_cut,
2458 __len1 - __len11, __len22, __buffer,
2459 __buffer_size);
2460 std::__merge_adaptive(__first, __first_cut, __new_middle, __len11,
2461 __len22, __buffer, __buffer_size, __comp);
2462 std::__merge_adaptive(__new_middle, __second_cut, __last,
2463 __len1 - __len11,
2464 __len2 - __len22, __buffer,
2465 __buffer_size, __comp);
2466 }
2467 }
2468
2469 /// This is a helper function for the merge routines.
2470 template<typename _BidirectionalIterator, typename _Distance,
2471 typename _Compare>
2472 void
2473 __merge_without_buffer(_BidirectionalIterator __first,
2474 _BidirectionalIterator __middle,
2475 _BidirectionalIterator __last,
2476 _Distance __len1, _Distance __len2,
2477 _Compare __comp)
2478 {
2479 if (__len1 == 0 || __len2 == 0)
2480 return;
2481
2482 if (__len1 + __len2 == 2)
2483 {
2484 if (__comp(__middle, __first))
2485 std::iter_swap(__first, __middle);
2486 return;
2487 }
2488
2489 _BidirectionalIterator __first_cut = __first;
2490 _BidirectionalIterator __second_cut = __middle;
2491 _Distance __len11 = 0;
2492 _Distance __len22 = 0;
2493 if (__len1 > __len2)
2494 {
2495 __len11 = __len1 / 2;
2496 std::advance(__first_cut, __len11);
2497 __second_cut
2498 = std::__lower_bound(__middle, __last, *__first_cut,
2499 __gnu_cxx::__ops::__iter_comp_val(__comp));
2500 __len22 = std::distance(__middle, __second_cut);
2501 }
2502 else
2503 {
2504 __len22 = __len2 / 2;
2505 std::advance(__second_cut, __len22);
2506 __first_cut
2507 = std::__upper_bound(__first, __middle, *__second_cut,
2508 __gnu_cxx::__ops::__val_comp_iter(__comp));
2509 __len11 = std::distance(__first, __first_cut);
2510 }
2511
2512 std::rotate(__first_cut, __middle, __second_cut);
2513 _BidirectionalIterator __new_middle = __first_cut;
2514 std::advance(__new_middle, std::distance(__middle, __second_cut));
2515 std::__merge_without_buffer(__first, __first_cut, __new_middle,
2516 __len11, __len22, __comp);
2517 std::__merge_without_buffer(__new_middle, __second_cut, __last,
2518 __len1 - __len11, __len2 - __len22, __comp);
2519 }
2520
2521 template<typename _BidirectionalIterator, typename _Compare>
2522 void
2523 __inplace_merge(_BidirectionalIterator __first,
2524 _BidirectionalIterator __middle,
2525 _BidirectionalIterator __last,
2526 _Compare __comp)
2527 {
2528 typedef typename iterator_traits<_BidirectionalIterator>::value_type
2529 _ValueType;
2530 typedef typename iterator_traits<_BidirectionalIterator>::difference_type
2531 _DistanceType;
2532
2533 if (__first == __middle || __middle == __last)
2534 return;
2535
2536 const _DistanceType __len1 = std::distance(__first, __middle);
2537 const _DistanceType __len2 = std::distance(__middle, __last);
2538
2539 typedef _Temporary_buffer<_BidirectionalIterator, _ValueType> _TmpBuf;
2540 _TmpBuf __buf(__first, __last);
2541
2542 if (__buf.begin() == 0)
2543 std::__merge_without_buffer
2544 (__first, __middle, __last, __len1, __len2, __comp);
2545 else
2546 std::__merge_adaptive
2547 (__first, __middle, __last, __len1, __len2, __buf.begin(),
2548 _DistanceType(__buf.size()), __comp);
2549 }
2550
2551 /**
2552 * @brief Merges two sorted ranges in place.
2553 * @ingroup sorting_algorithms
2554 * @param __first An iterator.
2555 * @param __middle Another iterator.
2556 * @param __last Another iterator.
2557 * @return Nothing.
2558 *
2559 * Merges two sorted and consecutive ranges, [__first,__middle) and
2560 * [__middle,__last), and puts the result in [__first,__last). The
2561 * output will be sorted. The sort is @e stable, that is, for
2562 * equivalent elements in the two ranges, elements from the first
2563 * range will always come before elements from the second.
2564 *
2565 * If enough additional memory is available, this takes (__last-__first)-1
2566 * comparisons. Otherwise an NlogN algorithm is used, where N is
2567 * distance(__first,__last).
2568 */
2569 template<typename _BidirectionalIterator>
2570 inline void
2571 inplace_merge(_BidirectionalIterator __first,
2572 _BidirectionalIterator __middle,
2573 _BidirectionalIterator __last)
2574 {
2575 // concept requirements
2576 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
2577 _BidirectionalIterator>)
2578 __glibcxx_function_requires(_LessThanComparableConcept<
2579 typename iterator_traits<_BidirectionalIterator>::value_type>)
2580 __glibcxx_requires_sorted(__first, __middle);
2581 __glibcxx_requires_sorted(__middle, __last);
2582 __glibcxx_requires_irreflexive(__first, __last);
2583
2584 std::__inplace_merge(__first, __middle, __last,
2585 __gnu_cxx::__ops::__iter_less_iter());
2586 }
2587
2588 /**
2589 * @brief Merges two sorted ranges in place.
2590 * @ingroup sorting_algorithms
2591 * @param __first An iterator.
2592 * @param __middle Another iterator.
2593 * @param __last Another iterator.
2594 * @param __comp A functor to use for comparisons.
2595 * @return Nothing.
2596 *
2597 * Merges two sorted and consecutive ranges, [__first,__middle) and
2598 * [__middle,__last), and puts the result in [__first,__last). The output will
2599 * be sorted. The sort is @e stable, that is, for equivalent
2600 * elements in the two ranges, elements from the first range will always
2601 * come before elements from the second.
2602 *
2603 * If enough additional memory is available, this takes (__last-__first)-1
2604 * comparisons. Otherwise an NlogN algorithm is used, where N is
2605 * distance(__first,__last).
2606 *
2607 * The comparison function should have the same effects on ordering as
2608 * the function used for the initial sort.
2609 */
2610 template<typename _BidirectionalIterator, typename _Compare>
2611 inline void
2612 inplace_merge(_BidirectionalIterator __first,
2613 _BidirectionalIterator __middle,
2614 _BidirectionalIterator __last,
2615 _Compare __comp)
2616 {
2617 // concept requirements
2618 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
2619 _BidirectionalIterator>)
2620 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2621 typename iterator_traits<_BidirectionalIterator>::value_type,
2622 typename iterator_traits<_BidirectionalIterator>::value_type>)
2623 __glibcxx_requires_sorted_pred(__first, __middle, __comp);
2624 __glibcxx_requires_sorted_pred(__middle, __last, __comp);
2625 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
2626
2627 std::__inplace_merge(__first, __middle, __last,
2628 __gnu_cxx::__ops::__iter_comp_iter(__comp));
2629 }
2630
2631
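A small usage sketch for inplace_merge; the six-element vector is illustrative. Whether the buffered (__merge_adaptive) or buffer-free (__merge_without_buffer) path runs depends on the temporary buffer allocation at run time:

    #include <algorithm>
    #include <vector>

    int main()
    {
      // Two sorted halves sharing one container: [1,3,5 | 2,4,6].
      std::vector<int> v{1, 3, 5, 2, 4, 6};
      std::inplace_merge(v.begin(), v.begin() + 3, v.end());
      return std::is_sorted(v.begin(), v.end()) ? 0 : 1;
    }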
2632 /// This is a helper function for the __merge_sort_loop routines.
2633 template<typename _InputIterator, typename _OutputIterator,
2634 typename _Compare>
2635 _OutputIterator
2636 __move_merge(_InputIterator __first1, _InputIterator __last1,
2637 _InputIterator __first2, _InputIterator __last2,
2638 _OutputIterator __result, _Compare __comp)
2639 {
2640 while (__first1 != __last1 && __first2 != __last2)
2641 {
2642 if (__comp(__first2, __first1))
2643 {
2644 *__result = _GLIBCXX_MOVE(*__first2);
2645 ++__first2;
2646 }
2647 else
2648 {
2649 *__result = _GLIBCXX_MOVE(*__first1);
2650 ++__first1;
2651 }
2652 ++__result;
2653 }
2654 return _GLIBCXX_MOVE3(__first2, __last2,
2655 _GLIBCXX_MOVE3(__first1, __last1,
2656 __result));
2657 }
2658
2659 template<typename _RandomAccessIterator1, typename _RandomAccessIterator2,
2660 typename _Distance, typename _Compare>
2661 void
2662 __merge_sort_loop(_RandomAccessIterator1 __first,
2663 _RandomAccessIterator1 __last,
2664 _RandomAccessIterator2 __result, _Distance __step_size,
2665 _Compare __comp)
2666 {
2667 const _Distance __two_step = 2 * __step_size;
2668
2669 while (__last - __first >= __two_step)
2670 {
2671 __result = std::__move_merge(__first, __first + __step_size,
2672 __first + __step_size,
2673 __first + __two_step,
2674 __result, __comp);
2675 __first += __two_step;
2676 }
2677 __step_size = std::min(_Distance(__last - __first), __step_size);
2678
2679 std::__move_merge(__first, __first + __step_size,
2680 __first + __step_size, __last, __result, __comp);
2681 }
2682
2683 template<typename _RandomAccessIterator, typename _Distance,
2684 typename _Compare>
2685 void
2686 __chunk_insertion_sort(_RandomAccessIterator __first,
2687 _RandomAccessIterator __last,
2688 _Distance __chunk_size, _Compare __comp)
2689 {
2690 while (__last - __first >= __chunk_size)
2691 {
2692 std::__insertion_sort(__first, __first + __chunk_size, __comp);
2693 __first += __chunk_size;
2694 }
2695 std::__insertion_sort(__first, __last, __comp);
2696 }
2697
2698 enum { _S_chunk_size = 7 };
2699
2700 template<typename _RandomAccessIterator, typename _Pointer, typename _Compare>
2701 void
2702 __merge_sort_with_buffer(_RandomAccessIterator __first,
2703 _RandomAccessIterator __last,
2704 _Pointer __buffer, _Compare __comp)
2705 {
2706 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
2707 _Distance;
2708
2709 const _Distance __len = __last - __first;
2710 const _Pointer __buffer_last = __buffer + __len;
2711
2712 _Distance __step_size = _S_chunk_size;
2713 std::__chunk_insertion_sort(__first, __last, __step_size, __comp);
2714
2715 while (__step_size < __len)
2716 {
2717 std::__merge_sort_loop(__first, __last, __buffer,
2718 __step_size, __comp);
2719 __step_size *= 2;
2720 std::__merge_sort_loop(__buffer, __buffer_last, __first,
2721 __step_size, __comp);
2722 __step_size *= 2;
2723 }
2724 }
2725
2726 template<typename _RandomAccessIterator, typename _Pointer,
2727 typename _Distance, typename _Compare>
2728 void
2729 __stable_sort_adaptive(_RandomAccessIterator __first,
2730 _RandomAccessIterator __last,
2731 _Pointer __buffer, _Distance __buffer_size,
2732 _Compare __comp)
2733 {
2734 const _Distance __len = (__last - __first + 1) / 2;
2735 const _RandomAccessIterator __middle = __first + __len;
2736 if (__len > __buffer_size)
2737 {
2738 std::__stable_sort_adaptive(__first, __middle, __buffer,
2739 __buffer_size, __comp);
2740 std::__stable_sort_adaptive(__middle, __last, __buffer,
2741 __buffer_size, __comp);
2742 }
2743 else
2744 {
2745 std::__merge_sort_with_buffer(__first, __middle, __buffer, __comp);
2746 std::__merge_sort_with_buffer(__middle, __last, __buffer, __comp);
2747 }
2748 std::__merge_adaptive(__first, __middle, __last,
2749 _Distance(__middle - __first),
2750 _Distance(__last - __middle),
2751 __buffer, __buffer_size,
2752 __comp);
2753 }
2754
2755 /// This is a helper function for the stable sorting routines.
2756 template<typename _RandomAccessIterator, typename _Compare>
2757 void
2758 __inplace_stable_sort(_RandomAccessIterator __first,
2759 _RandomAccessIterator __last, _Compare __comp)
2760 {
2761 if (__last - __first < 15)
2762 {
2763 std::__insertion_sort(__first, __last, __comp);
2764 return;
2765 }
2766 _RandomAccessIterator __middle = __first + (__last - __first) / 2;
2767 std::__inplace_stable_sort(__first, __middle, __comp);
2768 std::__inplace_stable_sort(__middle, __last, __comp);
2769 std::__merge_without_buffer(__first, __middle, __last,
2770 __middle - __first,
2771 __last - __middle,
2772 __comp);
2773 }
2774
2775 // stable_sort
2776
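std::stable_sort builds on the helpers above: __stable_sort_adaptive when a temporary buffer is available, __inplace_stable_sort otherwise. A sketch of the stability guarantee they provide, with illustrative data:

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
      // Sort by key only; elements with equal keys keep their original order.
      std::vector<std::pair<int, std::string>> v{{2, "b"}, {1, "a"}, {2, "a"}};
      std::stable_sort(v.begin(), v.end(),
                       [](const auto& x, const auto& y) { return x.first < y.first; });
      return (v[1].second == "b" && v[2].second == "a") ? 0 : 1;
    }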
2777 // Set algorithms: includes, set_union, set_intersection, set_difference,
2778 // set_symmetric_difference. All of these algorithms have the precondition
2779 // that their input ranges are sorted and the postcondition that their output
2780 // ranges are sorted.
2781
2782 template<typename _InputIterator1, typename _InputIterator2,
2783 typename _Compare>
2784 bool
2785 __includes(_InputIterator1 __first1, _InputIterator1 __last1,
2786 _InputIterator2 __first2, _InputIterator2 __last2,
2787 _Compare __comp)
2788 {
2789 while (__first1 != __last1 && __first2 != __last2)
2790 if (__comp(__first2, __first1))
2791 return false;
2792 else if (__comp(__first1, __first2))
2793 ++__first1;
2794 else
2795 {
2796 ++__first1;
2797 ++__first2;
2798 }
2799
2800 return __first2 == __last2;
2801 }
2802
2803 /**
2804 * @brief Determines whether all elements of a sequence exist in a range.
2805 * @param __first1 Start of search range.
2806 * @param __last1 End of search range.
2807 * @param __first2 Start of sequence.
2808 * @param __last2 End of sequence.
2809 * @return True if each element in [__first2,__last2) is contained in order
2810 * within [__first1,__last1). False otherwise.
2811 * @ingroup set_algorithms
2812 *
2813 * This operation expects both [__first1,__last1) and
2814 * [__first2,__last2) to be sorted. Searches for the presence of
2815 * each element in [__first2,__last2) within [__first1,__last1).
2816 * The iterators over each range only move forward, so this is a
2817 * linear algorithm. If an element in [__first2,__last2) is not
2818 * found before the search iterator reaches @p __last2, false is
2819 * returned.
2820 */
2821 template<typename _InputIterator1, typename _InputIterator2>
2822 inline bool
2823 includes(_InputIterator1 __first1, _InputIterator1 __last1,
2824 _InputIterator2 __first2, _InputIterator2 __last2)
2825 {
2826 // concept requirements
2827 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
2828 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
2829 __glibcxx_function_requires(_LessThanOpConcept<
2830 typename iterator_traits<_InputIterator1>::value_type,
2831 typename iterator_traits<_InputIterator2>::value_type>)
2832 __glibcxx_function_requires(_LessThanOpConcept<
2833 typename iterator_traits<_InputIterator2>::value_type,
2834 typename iterator_traits<_InputIterator1>::value_type>)
2835 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
2836 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
2837 __glibcxx_requires_irreflexive2(__first1, __last1);
2838 __glibcxx_requires_irreflexive2(__first2, __last2);
2839
2840 return std::__includes(__first1, __last1, __first2, __last2,
2841 __gnu_cxx::__ops::__iter_less_iter());
2842 }
2843
2844 /**
2845 * @brief Determines whether all elements of a sequence exist in a range
2846 * using comparison.
2847 * @ingroup set_algorithms
2848 * @param __first1 Start of search range.
2849 * @param __last1 End of search range.
2850 * @param __first2 Start of sequence.
2851 * @param __last2 End of sequence.
2852 * @param __comp Comparison function to use.
2853 * @return True if each element in [__first2,__last2) is contained
2854 * in order within [__first1,__last1) according to comp. False
2855 * otherwise. @ingroup set_algorithms
2856 *
2857 * This operation expects both [__first1,__last1) and
2858 * [__first2,__last2) to be sorted. Searches for the presence of
2859 * each element in [__first2,__last2) within [__first1,__last1),
2860 * using comp to decide. The iterators over each range only move
2861 * forward, so this is a linear algorithm. If an element in
2862 * [__first2,__last2) is not found before the search iterator
2863 * reaches @p __last2, false is returned.
2864 */
2865 template<typename _InputIterator1, typename _InputIterator2,
2866 typename _Compare>
2867 inline bool
2868 includes(_InputIterator1 __first1, _InputIterator1 __last1,
2869 _InputIterator2 __first2, _InputIterator2 __last2,
2870 _Compare __comp)
2871 {
2872 // concept requirements
2873 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
2874 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
2875 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2876 typename iterator_traits<_InputIterator1>::value_type,
2877 typename iterator_traits<_InputIterator2>::value_type>)
2878 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2879 typename iterator_traits<_InputIterator2>::value_type,
2880 typename iterator_traits<_InputIterator1>::value_type>)
2881 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
2882 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
2883 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
2884 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
2885
2886 return std::__includes(__first1, __last1, __first2, __last2,
2887 __gnu_cxx::__ops::__iter_comp_iter(__comp));
2888 }
2889
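An illustrative check of includes on sorted ranges; both inputs must already be sorted with the same ordering:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> a{1, 2, 3, 4, 5, 6};
      std::vector<int> b{2, 4, 6};
      std::vector<int> c{2, 7};
      bool sub     = std::includes(a.begin(), a.end(), b.begin(), b.end()); // true
      bool not_sub = std::includes(a.begin(), a.end(), c.begin(), c.end()); // false
      return (sub && !not_sub) ? 0 : 1;
    }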
2890 // nth_element
2891 // merge
2892 // set_difference
2893 // set_intersection
2894 // set_union
2895 // stable_sort
2896 // set_symmetric_difference
2897 // min_element
2898 // max_element
2899
2900 template<typename _BidirectionalIterator, typename _Compare>
2901 bool
2902 __next_permutation(_BidirectionalIterator __first,
2903 _BidirectionalIterator __last, _Compare __comp)
2904 {
2905 if (__first == __last)
2906 return false;
2907 _BidirectionalIterator __i = __first;
2908 ++__i;
2909 if (__i == __last)
2910 return false;
2911 __i = __last;
2912 --__i;
2913
2914 for(;;)
2915 {
2916 _BidirectionalIterator __ii = __i;
2917 --__i;
2918 if (__comp(__i, __ii))
2919 {
2920 _BidirectionalIterator __j = __last;
2921 while (!__comp(__i, --__j))
2922 {}
2923 std::iter_swap(__i, __j);
2924 std::__reverse(__ii, __last,
2925 std::__iterator_category(__first));
2926 return true;
2927 }
2928 if (__i == __first)
2929 {
2930 std::__reverse(__first, __last,
2931 std::__iterator_category(__first));
2932 return false;
2933 }
2934 }
2935 }
2936
2937 /**
2938 * @brief Permute range into the next @e dictionary ordering.
2939 * @ingroup sorting_algorithms
2940 * @param __first Start of range.
2941 * @param __last End of range.
2942 * @return False if wrapped to first permutation, true otherwise.
2943 *
2944 * Treats all permutations of the range as a set of @e dictionary sorted
2945 * sequences. Permutes the current sequence into the next one of this set.
2946 * Returns true if there are more sequences to generate. If the sequence
2947 * is the largest of the set, the smallest is generated and false returned.
2948 */
2949 template<typename _BidirectionalIterator>
2950 inline bool
2951 next_permutation(_BidirectionalIterator __first,
2952 _BidirectionalIterator __last)
2953 {
2954 // concept requirements
2955 __glibcxx_function_requires(_BidirectionalIteratorConcept<
2956 _BidirectionalIterator>)
2957 __glibcxx_function_requires(_LessThanComparableConcept<
2958 typename iterator_traits<_BidirectionalIterator>::value_type>)
2959 __glibcxx_requires_valid_range(__first, __last);
2960 __glibcxx_requires_irreflexive(__first, __last);
2961
2962 return std::__next_permutation
2963 (__first, __last, __gnu_cxx::__ops::__iter_less_iter());
2964 }
2965
2966 /**
2967 * @brief Permute range into the next @e dictionary ordering using
2968 * comparison functor.
2969 * @ingroup sorting_algorithms
2970 * @param __first Start of range.
2971 * @param __last End of range.
2972 * @param __comp A comparison functor.
2973 * @return False if wrapped to first permutation, true otherwise.
2974 *
2975 * Treats all permutations of the range [__first,__last) as a set of
2976 * @e dictionary sorted sequences ordered by @p __comp. Permutes the current
2977 * sequence into the next one of this set. Returns true if there are more
2978 * sequences to generate. If the sequence is the largest of the set, the
2979 * smallest is generated and false returned.
2980 */
2981 template<typename _BidirectionalIterator, typename _Compare>
2982 inline bool
2983 next_permutation(_BidirectionalIterator __first,
2984 _BidirectionalIterator __last, _Compare __comp)
2985 {
2986 // concept requirements
2987 __glibcxx_function_requires(_BidirectionalIteratorConcept<
2988 _BidirectionalIterator>)
2989 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2990 typename iterator_traits<_BidirectionalIterator>::value_type,
2991 typename iterator_traits<_BidirectionalIterator>::value_type>)
2992 __glibcxx_requires_valid_range(__first, __last);
2993 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
2994
2995 return std::__next_permutation
2996 (__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
2997 }
2998
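A short sketch of the wrap-around behaviour documented above; the three-element vectors are illustrative:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3};
      std::next_permutation(v.begin(), v.end());                        // -> {1, 3, 2}
      std::vector<int> last{3, 2, 1};
      bool wrapped = !std::next_permutation(last.begin(), last.end());  // wraps to {1, 2, 3}
      return (v == std::vector<int>{1, 3, 2} && wrapped) ? 0 : 1;
    }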
2999 template<typename _BidirectionalIterator, typename _Compare>
3000 bool
3001 __prev_permutation(_BidirectionalIterator __first,
3002 _BidirectionalIterator __last, _Compare __comp)
3003 {
3004 if (__first == __last)
3005 return false;
3006 _BidirectionalIterator __i = __first;
3007 ++__i;
3008 if (__i == __last)
3009 return false;
3010 __i = __last;
3011 --__i;
3012
3013 for(;;)
3014 {
3015 _BidirectionalIterator __ii = __i;
3016 --__i;
3017 if (__comp(__ii, __i))
3018 {
3019 _BidirectionalIterator __j = __last;
3020 while (!__comp(--__j, __i))
3021 {}
3022 std::iter_swap(__i, __j);
3023 std::__reverse(__ii, __last,
3024 std::__iterator_category(__first));
3025 return true;
3026 }
3027 if (__i == __first)
3028 {
3029 std::__reverse(__first, __last,
3030 std::__iterator_category(__first));
3031 return false;
3032 }
3033 }
3034 }
3035
3036 /**
3037 * @brief Permute range into the previous @e dictionary ordering.
3038 * @ingroup sorting_algorithms
3039 * @param __first Start of range.
3040 * @param __last End of range.
3041 * @return False if wrapped to last permutation, true otherwise.
3042 *
3043 * Treats all permutations of the range as a set of @e dictionary sorted
3044 * sequences. Permutes the current sequence into the previous one of this
3045 * set. Returns true if there are more sequences to generate. If the
3046 * sequence is the smallest of the set, the largest is generated and false
3047 * returned.
3048 */
3049 template<typename _BidirectionalIterator>
3050 inline bool
3051 prev_permutation(_BidirectionalIterator __first,
3052 _BidirectionalIterator __last)
3053 {
3054 // concept requirements
3055 __glibcxx_function_requires(_BidirectionalIteratorConcept<
3056 _BidirectionalIterator>)
3057 __glibcxx_function_requires(_LessThanComparableConcept<
3058 typename iterator_traits<_BidirectionalIterator>::value_type>)
3059 __glibcxx_requires_valid_range(__first, __last);
3060 __glibcxx_requires_irreflexive(__first, __last);
3061
3062 return std::__prev_permutation(__first, __last,
3063 __gnu_cxx::__ops::__iter_less_iter());
3064 }
3065
3066 /**
3067 * @brief Permute range into the previous @e dictionary ordering using
3068 * comparison functor.
3069 * @ingroup sorting_algorithms
3070 * @param __first Start of range.
3071 * @param __last End of range.
3072 * @param __comp A comparison functor.
3073 * @return False if wrapped to last permutation, true otherwise.
3074 *
3075 * Treats all permutations of the range [__first,__last) as a set of
3076 * @e dictionary sorted sequences ordered by @p __comp. Permutes the current
3077 * sequence into the previous one of this set. Returns true if there are
3078 * more sequences to generate. If the sequence is the smallest of the set,
3079 * the largest is generated and false returned.
3080 */
3081 template<typename _BidirectionalIterator, typename _Compare>
3082 inline bool
3083 prev_permutation(_BidirectionalIterator __first,
3084 _BidirectionalIterator __last, _Compare __comp)
3085 {
3086 // concept requirements
3087 __glibcxx_function_requires(_BidirectionalIteratorConcept<
3088 _BidirectionalIterator>)
3089 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3090 typename iterator_traits<_BidirectionalIterator>::value_type,
3091 typename iterator_traits<_BidirectionalIterator>::value_type>)
3092 __glibcxx_requires_valid_range(__first, __last);
3093 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3094
3095 return std::__prev_permutation(__first, __last,
3096 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3097 }
3098
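prev_permutation mirrors next_permutation with the comparison arguments swapped, stepping backwards through dictionary order. An illustrative check:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 3, 2};
      std::prev_permutation(v.begin(), v.end());                  // -> {1, 2, 3}
      bool wrapped = !std::prev_permutation(v.begin(), v.end());  // wraps to {3, 2, 1}
      return (wrapped && v == std::vector<int>{3, 2, 1}) ? 0 : 1;
    }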
3099 // replace
3100 // replace_if
3101
3102 template<typename _InputIterator, typename _OutputIterator,
3103 typename _Predicate, typename _Tp>
3104 _OutputIterator
3105 __replace_copy_if(_InputIterator __first, _InputIterator __last,
3106 _OutputIterator __result,
3107 _Predicate __pred, const _Tp& __new_value)
3108 {
3109 for (; __first != __last; ++__first, (void)++__result)
3110 if (__pred(__first))
3111 *__result = __new_value;
3112 else
3113 *__result = *__first;
3114 return __result;
3115 }
3116
3117 /**
3118 * @brief Copy a sequence, replacing each element of one value with another
3119 * value.
3120 * @param __first An input iterator.
3121 * @param __last An input iterator.
3122 * @param __result An output iterator.
3123 * @param __old_value The value to be replaced.
3124 * @param __new_value The replacement value.
3125 * @return The end of the output sequence, @p result+(last-first).
3126 *
3127 * Copies each element in the input range @p [__first,__last) to the
3128 * output range @p [__result,__result+(__last-__first)) replacing elements
3129 * equal to @p __old_value with @p __new_value.
3130 */
3131 template<typename _InputIterator, typename _OutputIterator, typename _Tp>
3132 inline _OutputIterator
3133 replace_copy(_InputIterator __first, _InputIterator __last,
3134 _OutputIterator __result,
3135 const _Tp& __old_value, const _Tp& __new_value)
3136 {
3137 // concept requirements
3138 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3139 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
3140 typename iterator_traits<_InputIterator>::value_type>)
3141 __glibcxx_function_requires(_EqualOpConcept<
3142 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3143 __glibcxx_requires_valid_range(__first, __last);
3144
3145 return std::__replace_copy_if(__first, __last, __result,
3146 __gnu_cxx::__ops::__iter_equals_val(__old_value),
3147 __new_value);
3148 }
3149
3150 /**
3151 * @brief Copy a sequence, replacing each value for which a predicate
3152 * returns true with another value.
3153 * @ingroup mutating_algorithms
3154 * @param __first An input iterator.
3155 * @param __last An input iterator.
3156 * @param __result An output iterator.
3157 * @param __pred A predicate.
3158 * @param __new_value The replacement value.
3159 * @return The end of the output sequence, @p __result+(__last-__first).
3160 *
3161 * Copies each element in the range @p [__first,__last) to the range
3162 * @p [__result,__result+(__last-__first)) replacing elements for which
3163 * @p __pred returns true with @p __new_value.
3164 */
3165 template<typename _InputIterator, typename _OutputIterator,
3166 typename _Predicate, typename _Tp>
3167 inline _OutputIterator
3168 replace_copy_if(_InputIterator __first, _InputIterator __last,
3169 _OutputIterator __result,
3170 _Predicate __pred, const _Tp& __new_value)
3171 {
3172 // concept requirements
3173 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3174 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
3175 typename iterator_traits<_InputIterator>::value_type>)
3176 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3177 typename iterator_traits<_InputIterator>::value_type>)
3178 __glibcxx_requires_valid_range(__first, __last);
3179
3180 return std::__replace_copy_if(__first, __last, __result,
3181 __gnu_cxx::__ops::__pred_iter(__pred),
3182 __new_value);
3183 }
3184
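Both replace_copy overloads funnel into __replace_copy_if above; the predicate form just forwards the predicate directly. A sketch with an illustrative predicate:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> in{1, -2, 3, -4};
      std::vector<int> out;
      // Copy, replacing negative values with 0.
      std::replace_copy_if(in.begin(), in.end(), std::back_inserter(out),
                           [](int x) { return x < 0; }, 0);
      return out == std::vector<int>{1, 0, 3, 0} ? 0 : 1;
    }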
3185 template<typename _InputIterator, typename _Predicate>
3186 typename iterator_traits<_InputIterator>::difference_type
3187 __count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred)
3188 {
3189 typename iterator_traits<_InputIterator>::difference_type __n = 0;
3190 for (; __first != __last; ++__first)
3191 if (__pred(__first))
3192 ++__n;
3193 return __n;
3194 }
3195
3196#if __cplusplus >= 201103L
3197 /**
3198 * @brief Determines whether the elements of a sequence are sorted.
3199 * @ingroup sorting_algorithms
3200 * @param __first An iterator.
3201 * @param __last Another iterator.
3202 * @return True if the elements are sorted, false otherwise.
3203 */
3204 template<typename _ForwardIterator>
3205 inline bool
3206 is_sorted(_ForwardIterator __first, _ForwardIterator __last)
3207 { return std::is_sorted_until(__first, __last) == __last; }
3208
3209 /**
3210 * @brief Determines whether the elements of a sequence are sorted
3211 * according to a comparison functor.
3212 * @ingroup sorting_algorithms
3213 * @param __first An iterator.
3214 * @param __last Another iterator.
3215 * @param __comp A comparison functor.
3216 * @return True if the elements are sorted, false otherwise.
3217 */
3218 template<typename _ForwardIterator, typename _Compare>
3219 inline bool
3220 is_sorted(_ForwardIterator __first, _ForwardIterator __last,
3221 _Compare __comp)
3222 { return std::is_sorted_until(__first, __last, __comp) == __last; }
3223
3224 template<typename _ForwardIterator, typename _Compare>
3225 _ForwardIterator
3226 __is_sorted_until(_ForwardIterator __first, _ForwardIterator __last,
3227 _Compare __comp)
3228 {
3229 if (__first == __last)
3230 return __last;
3231
3232 _ForwardIterator __next = __first;
3233 for (++__next; __next != __last; __first = __next, (void)++__next)
3234 if (__comp(__next, __first))
3235 return __next;
3236 return __next;
3237 }
3238
3239 /**
3240 * @brief Determines the end of a sorted sequence.
3241 * @ingroup sorting_algorithms
3242 * @param __first An iterator.
3243 * @param __last Another iterator.
3244 * @return The last iterator i in [__first, __last] for which the
3245 * range [__first, i) is sorted.
3246 */
3247 template<typename _ForwardIterator>
3248 inline _ForwardIterator
3249 is_sorted_until(_ForwardIterator __first, _ForwardIterator __last)
3250 {
3251 // concept requirements
3252 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3253 __glibcxx_function_requires(_LessThanComparableConcept<
3254 typename iterator_traits<_ForwardIterator>::value_type>)
3255 __glibcxx_requires_valid_range(__first, __last);
3256 __glibcxx_requires_irreflexive(__first, __last);
3257
3258 return std::__is_sorted_until(__first, __last,
3259 __gnu_cxx::__ops::__iter_less_iter());
3260 }
3261
3262 /**
3263 * @brief Determines the end of a sorted sequence using comparison functor.
3264 * @ingroup sorting_algorithms
3265 * @param __first An iterator.
3266 * @param __last Another iterator.
3267 * @param __comp A comparison functor.
3268 * @return The last iterator i in [__first, __last] for which the
3269 * range [__first, i) is sorted.
3270 */
3271 template<typename _ForwardIterator, typename _Compare>
3272 inline _ForwardIterator
3273 is_sorted_until(_ForwardIterator __first, _ForwardIterator __last,
3274 _Compare __comp)
3275 {
3276 // concept requirements
3277 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3278 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3279 typename iterator_traits<_ForwardIterator>::value_type,
3280 typename iterator_traits<_ForwardIterator>::value_type>)
3281 __glibcxx_requires_valid_range(__first, __last);
3282 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3283
3284 return std::__is_sorted_until(__first, __last,
3285 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3286 }
3287
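is_sorted is defined directly in terms of is_sorted_until, as the one-line bodies above show. An illustrative check:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 4, 3, 5};
      auto it = std::is_sorted_until(v.begin(), v.end());   // points at the 3
      bool prefix_sorted = std::is_sorted(v.begin(), it);   // true
      return (*it == 3 && prefix_sorted) ? 0 : 1;
    }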
3288 /**
3289 * @brief Determines min and max at once as an ordered pair.
3290 * @ingroup sorting_algorithms
3291 * @param __a A thing of arbitrary type.
3292 * @param __b Another thing of arbitrary type.
3293 * @return A pair(__b, __a) if __b is smaller than __a, pair(__a,
3294 * __b) otherwise.
3295 */
3296 template<typename _Tp>
3297 _GLIBCXX14_CONSTEXPR
3298 inline pair<const _Tp&, const _Tp&>
3299 minmax(const _Tp& __a, const _Tp& __b)
3300 {
3301 // concept requirements
3302 __glibcxx_function_requires(_LessThanComparableConcept<_Tp>)
3303
3304 return __b < __a ? pair<const _Tp&, const _Tp&>(__b, __a)
3305 : pair<const _Tp&, const _Tp&>(__a, __b);
3306 }
3307
3308 /**
3309 * @brief Determines min and max at once as an ordered pair.
3310 * @ingroup sorting_algorithms
3311 * @param __a A thing of arbitrary type.
3312 * @param __b Another thing of arbitrary type.
3313 * @param __comp A @link comparison_functors comparison functor @endlink.
3314 * @return A pair(__b, __a) if __b is smaller than __a, pair(__a,
3315 * __b) otherwise.
3316 */
3317 template<typename _Tp, typename _Compare>
3318 _GLIBCXX14_CONSTEXPR
3319 inline pair<const _Tp&, const _Tp&>
3320 minmax(const _Tp& __a, const _Tp& __b, _Compare __comp)
3321 {
3322 return __comp(__b, __a) ? pair<const _Tp&, const _Tp&>(__b, __a)
3323 : pair<const _Tp&, const _Tp&>(__a, __b);
3324 }
3325
3326 template<typename _ForwardIterator, typename _Compare>
3327 _GLIBCXX14_CONSTEXPR
3328 pair<_ForwardIterator, _ForwardIterator>
3329 __minmax_element(_ForwardIterator __first, _ForwardIterator __last,
3330 _Compare __comp)
3331 {
3332 _ForwardIterator __next = __first;
3333 if (__first == __last
3334 || ++__next == __last)
3335 return std::make_pair(__first, __first);
3336
3337 _ForwardIterator __min{}, __max{};
3338 if (__comp(__next, __first))
3339 {
3340 __min = __next;
3341 __max = __first;
3342 }
3343 else
3344 {
3345 __min = __first;
3346 __max = __next;
3347 }
3348
3349 __first = __next;
3350 ++__first;
3351
3352 while (__first != __last)
3353 {
3354 __next = __first;
3355 if (++__next == __last)
3356 {
3357 if (__comp(__first, __min))
3358 __min = __first;
3359 else if (!__comp(__first, __max))
3360 __max = __first;
3361 break;
3362 }
3363
3364 if (__comp(__next, __first))
3365 {
3366 if (__comp(__next, __min))
3367 __min = __next;
3368 if (!__comp(__first, __max))
3369 __max = __first;
3370 }
3371 else
3372 {
3373 if (__comp(__first, __min))
3374 __min = __first;
3375 if (!__comp(__next, __max))
3376 __max = __next;
3377 }
3378
3379 __first = __next;
3380 ++__first;
3381 }
3382
3383 return std::make_pair(__min, __max);
3384 }
3385
3386 /**
3387 * @brief Return a pair of iterators pointing to the minimum and maximum
3388 * elements in a range.
3389 * @ingroup sorting_algorithms
3390 * @param __first Start of range.
3391 * @param __last End of range.
3392 * @return make_pair(m, M), where m is the first iterator i in
3393 * [__first, __last) such that no other element in the range is
3394 * smaller, and where M is the last iterator i in [__first, __last)
3395 * such that no other element in the range is larger.
3396 */
3397 template<typename _ForwardIterator>
3398 _GLIBCXX14_CONSTEXPR
3399 inline pair<_ForwardIterator, _ForwardIterator>
3400 minmax_element(_ForwardIterator __first, _ForwardIterator __last)
3401 {
3402 // concept requirements
3403 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3404 __glibcxx_function_requires(_LessThanComparableConcept<
3405 typename iterator_traits<_ForwardIterator>::value_type>)
3406 __glibcxx_requires_valid_range(__first, __last);
3407 __glibcxx_requires_irreflexive(__first, __last);
3408
3409 return std::__minmax_element(__first, __last,
3410 __gnu_cxx::__ops::__iter_less_iter());
3411 }
3412
3413 /**
3414 * @brief Return a pair of iterators pointing to the minimum and maximum
3415 * elements in a range.
3416 * @ingroup sorting_algorithms
3417 * @param __first Start of range.
3418 * @param __last End of range.
3419 * @param __comp Comparison functor.
3420 * @return make_pair(m, M), where m is the first iterator i in
3421 * [__first, __last) such that no other element in the range is
3422 * smaller, and where M is the last iterator i in [__first, __last)
3423 * such that no other element in the range is larger.
3424 */
3425 template<typename _ForwardIterator, typename _Compare>
3426 _GLIBCXX14_CONSTEXPR
3427 inline pair<_ForwardIterator, _ForwardIterator>
3428 minmax_element(_ForwardIterator __first, _ForwardIterator __last,
3429 _Compare __comp)
3430 {
3431 // concept requirements
3432 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3433 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3434 typename iterator_traits<_ForwardIterator>::value_type,
3435 typename iterator_traits<_ForwardIterator>::value_type>)
3436 __glibcxx_requires_valid_range(__first, __last);
3437 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3438
3439 return std::__minmax_element(__first, __last,
3440 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3441 }
3442
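__minmax_element walks the range in pairs, which keeps the comparison count near 3N/2 instead of the 2N of two separate scans. Note it returns the first minimum but the last maximum; the vector below is illustrative:

    #include <algorithm>
    #include <vector>

    int main()
    {
      std::vector<int> v{3, 1, 4, 1, 5, 5};
      auto mm = std::minmax_element(v.begin(), v.end());
      // First 1 (index 1) and last 5 (index 5).
      return (mm.first - v.begin() == 1 && mm.second - v.begin() == 5) ? 0 : 1;
    }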
3443 // N2722 + DR 915.
3444 template<typename _Tp>
3445 _GLIBCXX14_CONSTEXPR
3446 inline _Tp
3447 min(initializer_list<_Tp> __l)
3448 { return *std::min_element(__l.begin(), __l.end()); }
3449
3450 template<typename _Tp, typename _Compare>
3451 _GLIBCXX14_CONSTEXPR
3452 inline _Tp
3453 min(initializer_list<_Tp> __l, _Compare __comp)
3454 { return *std::min_element(__l.begin(), __l.end(), __comp); }
3455
3456 template<typename _Tp>
3457 _GLIBCXX14_CONSTEXPR
3458 inline _Tp
3459 max(initializer_list<_Tp> __l)
3460 { return *std::max_element(__l.begin(), __l.end()); }
3461
3462 template<typename _Tp, typename _Compare>
3463 _GLIBCXX14_CONSTEXPR
3464 inline _Tp
3465 max(initializer_list<_Tp> __l, _Compare __comp)
3466 { return *std::max_element(__l.begin(), __l.end(), __comp); }
3467
3468 template<typename _Tp>
3469 _GLIBCXX14_CONSTEXPR
3470 inline pair<_Tp, _Tp>
3471 minmax(initializer_list<_Tp> __l)
3472 {
3473 pair<const _Tp*, const _Tp*> __p =
3474 std::minmax_element(__l.begin(), __l.end());
3475 return std::make_pair(*__p.first, *__p.second);
3476 }
3477
3478 template<typename _Tp, typename _Compare>
3479 _GLIBCXX14_CONSTEXPR
3480 inline pair<_Tp, _Tp>
3481 minmax(initializer_list<_Tp> __l, _Compare __comp)
3482 {
3483 pair<const _Tp*, const _Tp*> __p =
3484 std::minmax_element(__l.begin(), __l.end(), __comp);
3485 return std::make_pair(*__p.first, *__p.second);
3486 }
3487
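The initializer_list overloads above simply defer to min_element, max_element, and minmax_element. An illustrative check:

    #include <algorithm>
    #include <initializer_list>

    int main()
    {
      auto lo   = std::min({3, 1, 2});        // 1
      auto hi   = std::max({3, 1, 2});        // 3
      auto both = std::minmax({3, 1, 2});     // pair {1, 3}
      return (lo == 1 && hi == 3 && both.first == 1 && both.second == 3) ? 0 : 1;
    }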
3488 template<typename _ForwardIterator1, typename _ForwardIterator2,
3489 typename _BinaryPredicate>
3490 bool
3491 __is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3492 _ForwardIterator2 __first2, _BinaryPredicate __pred)
3493 {
3494 // Efficiently compare identical prefixes: O(N) if sequences
3495 // have the same elements in the same order.
3496 for (; __first1 != __last1; ++__first1, (void)++__first2)
3497 if (!__pred(__first1, __first2))
3498 break;
3499
3500 if (__first1 == __last1)
3501 return true;
3502
3503 // Establish __last2 assuming equal ranges by iterating over the
3504 // rest of the list.
3505 _ForwardIterator2 __last2 = __first2;
3506 std::advance(__last2, std::distance(__first1, __last1));
3507 for (_ForwardIterator1 __scan = __first1; __scan != __last1; ++__scan)
3508 {
3509 if (__scan != std::__find_if(__first1, __scan,
3510 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan)))
3511 continue; // We've seen this one before.
3512
3513 auto __matches
3514 = std::__count_if(__first2, __last2,
3515 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan));
3516 if (0 == __matches ||
3517 std::__count_if(__scan, __last1,
3518 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan))
3519 != __matches)
3520 return false;
3521 }
3522 return true;
3523 }
3524
3525 /**
3526 * @brief Checks whether a permutation of the second sequence is equal
3527 * to the first sequence.
3528 * @ingroup non_mutating_algorithms
3529 * @param __first1 Start of first range.
3530 * @param __last1 End of first range.
3531 * @param __first2 Start of second range.
3532 * @return true if there exists a permutation of the elements in the range
3533 * [__first2, __first2 + (__last1 - __first1)), beginning with
3534 * ForwardIterator2 begin, such that equal(__first1, __last1, begin)
3535 * returns true; otherwise, returns false.
3536 */
3537 template<typename _ForwardIterator1, typename _ForwardIterator2>
3538 inline bool
3539 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3540 _ForwardIterator2 __first2)
3541 {
3542 // concept requirements
3543 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
3544 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
3545 __glibcxx_function_requires(_EqualOpConcept<
3546 typename iterator_traits<_ForwardIterator1>::value_type,
3547 typename iterator_traits<_ForwardIterator2>::value_type>)
3548 __glibcxx_requires_valid_range(__first1, __last1);
3549
3550 return std::__is_permutation(__first1, __last1, __first2,
3551 __gnu_cxx::__ops::__iter_equal_to_iter());
3552 }
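
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing the three-argument std::is_permutation, comparing
// one range against a reordering of the same values; data is illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> a{1, 2, 3, 4};
  std::vector<int> b{4, 3, 2, 1};
  assert(std::is_permutation(a.begin(), a.end(), b.begin()));
}
// ----------------------------------------------------------------------------
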
3553
3554 /**
3555 * @brief Checks whether a permutation of the second sequence is equal
3556 * to the first sequence.
3557 * @ingroup non_mutating_algorithms
3558 * @param __first1 Start of first range.
3559 * @param __last1 End of first range.
3560 * @param __first2 Start of second range.
3561 * @param __pred A binary predicate.
3562 * @return true if there exists a permutation of the elements in
3563 * the range [__first2, __first2 + (__last1 - __first1)),
3564 * beginning with ForwardIterator2 begin, such that
3565 * equal(__first1, __last1, __begin, __pred) returns true;
3566 * otherwise, returns false.
3567 */
3568 template<typename _ForwardIterator1, typename _ForwardIterator2,
3569 typename _BinaryPredicate>
3570 inline bool
3571 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3572 _ForwardIterator2 __first2, _BinaryPredicate __pred)
3573 {
3574 // concept requirements
3575 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
3576 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
3577 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3578 typename iterator_traits<_ForwardIterator1>::value_type,
3579 typename iterator_traits<_ForwardIterator2>::value_type>)
3580 __glibcxx_requires_valid_range(__first1, __last1);
3581
3582 return std::__is_permutation(__first1, __last1, __first2,
3583 __gnu_cxx::__ops::__iter_comp_iter(__pred));
3584 }
3585
3586#if __cplusplus > 201103L
3587 template<typename _ForwardIterator1, typename _ForwardIterator2,
3588 typename _BinaryPredicate>
3589 bool
3590 __is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3591 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
3592 _BinaryPredicate __pred)
3593 {
3594 using _Cat1
3595 = typename iterator_traits<_ForwardIterator1>::iterator_category;
3596 using _Cat2
3597 = typename iterator_traits<_ForwardIterator2>::iterator_category;
3598 using _It1_is_RA = is_same<_Cat1, random_access_iterator_tag>;
3599 using _It2_is_RA = is_same<_Cat2, random_access_iterator_tag>;
3600 constexpr bool __ra_iters = _It1_is_RA() && _It2_is_RA();
3601 if (__ra_iters)
3602 {
3603 auto __d1 = std::distance(__first1, __last1);
3604 auto __d2 = std::distance(__first2, __last2);
3605 if (__d1 != __d2)
3606 return false;
3607 }
3608
3609 // Efficiently compare identical prefixes: O(N) if sequences
3610 // have the same elements in the same order.
3611 for (; __first1 != __last1 && __first2 != __last2;
3612 ++__first1, (void)++__first2)
3613 if (!__pred(__first1, __first2))
3614 break;
3615
3616 if (__ra_iters)
3617 {
3618 if (__first1 == __last1)
3619 return true;
3620 }
3621 else
3622 {
3623 auto __d1 = std::distance(__first1, __last1);
3624 auto __d2 = std::distance(__first2, __last2);
3625 if (__d1 == 0 && __d2 == 0)
3626 return true;
3627 if (__d1 != __d2)
3628 return false;
3629 }
3630
3631 for (_ForwardIterator1 __scan = __first1; __scan != __last1; ++__scan)
3632 {
3633 if (__scan != std::__find_if(__first1, __scan,
3634 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan)))
3635 continue; // We've seen this one before.
3636
3637 auto __matches = std::__count_if(__first2, __last2,
3638 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan));
3639 if (0 == __matches
3640 || std::__count_if(__scan, __last1,
3641 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan))
3642 != __matches)
3643 return false;
3644 }
3645 return true;
3646 }
3647
3648 /**
3649 * @brief Checks whether a permutation of the second sequence is equal
3650 * to the first sequence.
3651 * @ingroup non_mutating_algorithms
3652 * @param __first1 Start of first range.
3653 * @param __last1 End of first range.
3654 * @param __first2 Start of second range.
3655 * @param __last2 End of second range.
3656 * @return true if there exists a permutation of the elements in the range
3657 * [__first2, __last2), beginning with ForwardIterator2 begin,
3658 * such that equal(__first1, __last1, begin) returns true;
3659 * otherwise, returns false.
3660 */
3661 template<typename _ForwardIterator1, typename _ForwardIterator2>
3662 inline bool
3663 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3664 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
3665 {
3666 __glibcxx_requires_valid_range(__first1, __last1);
3667 __glibcxx_requires_valid_range(__first2, __last2);
3668
3669 return
3670 std::__is_permutation(__first1, __last1, __first2, __last2,
3671 __gnu_cxx::__ops::__iter_equal_to_iter());
3672 }
3673
3674 /**
3675 * @brief Checks whether a permutation of the second sequence is equal
3676 * to the first sequence.
3677 * @ingroup non_mutating_algorithms
3678 * @param __first1 Start of first range.
3679 * @param __last1 End of first range.
3680 * @param __first2 Start of second range.
3681 * @param __last2 End of second range.
3682 * @param __pred A binary predicate.
3683 * @return true if there exists a permutation of the elements in the range
3684 * [__first2, __last2), beginning with ForwardIterator2 begin,
3685 * such that equal(__first1, __last1, __begin, __pred) returns true;
3686 * otherwise, returns false.
3687 */
3688 template<typename _ForwardIterator1, typename _ForwardIterator2,
3689 typename _BinaryPredicate>
3690 inline bool
3691 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3692 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
3693 _BinaryPredicate __pred)
3694 {
3695 __glibcxx_requires_valid_range(__first1, __last1);
3696 __glibcxx_requires_valid_range(__first2, __last2);
3697
3698 return std::__is_permutation(__first1, __last1, __first2, __last2,
3699 __gnu_cxx::__ops::__iter_comp_iter(__pred));
3700 }
3701#endif
3702
3703#ifdef _GLIBCXX_USE_C99_STDINT_TR1
3704 /**
3705 * @brief Shuffle the elements of a sequence using a uniform random
3706 * number generator.
3707 * @ingroup mutating_algorithms
3708 * @param __first A forward iterator.
3709 * @param __last A forward iterator.
3710 * @param __g A UniformRandomNumberGenerator (26.5.1.3).
3711 * @return Nothing.
3712 *
3713 * Reorders the elements in the range @p [__first,__last) using @p __g to
3714 * provide random numbers.
3715 */
3716 template<typename _RandomAccessIterator,
3717 typename _UniformRandomNumberGenerator>
3718 void
3719 shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last,
3720 _UniformRandomNumberGenerator&& __g)
3721 {
3722 // concept requirements
3723 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
3724 _RandomAccessIterator>)
3725 __glibcxx_requires_valid_range(__first, __last);
3726
3727 if (__first == __last)
3728 return;
3729
3730 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
3731 _DistanceType;
3732
3733 typedef typename std::make_unsigned<_DistanceType>::type __ud_type;
3734 typedef typename std::uniform_int_distribution<__ud_type> __distr_type;
3735 typedef typename __distr_type::param_type __p_type;
3736 __distr_type __d;
3737
3738 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
3739 std::iter_swap(__i, __first + __d(__g, __p_type(0, __i - __first)));
3740 }
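
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::shuffle with a standard URNG; the seed value
// is an arbitrary illustration chosen for reproducibility.
#include <algorithm>
#include <random>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3, 4, 5};
  std::mt19937 gen(42);              // deterministic seed for reproducibility
  std::shuffle(v.begin(), v.end(), gen);
  // v now holds the same values in some pseudo-random order.
}
// ----------------------------------------------------------------------------
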
3741#endif
3742
3743#endif // C++11
3744
3745_GLIBCXX_END_NAMESPACE_VERSION
3746
3747_GLIBCXX_BEGIN_NAMESPACE_ALGO
3748
3749 /**
3750 * @brief Apply a function to every element of a sequence.
3751 * @ingroup non_mutating_algorithms
3752 * @param __first An input iterator.
3753 * @param __last An input iterator.
3754 * @param __f A unary function object.
3755 * @return @p __f (std::move(@p __f) in C++0x).
3756 *
3757 * Applies the function object @p __f to each element in the range
3758 * @p [__first,__last). @p __f must not modify the order of the sequence.
3759 * If @p __f has a return value it is ignored.
3760 */
3761 template<typename _InputIterator, typename _Function>
3762 _Function
3763 for_each(_InputIterator __first, _InputIterator __last, _Function __f)
3764 {
3765 // concept requirements
3766 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3767 __glibcxx_requires_valid_range(__first, __last);
3768 for (; __first != __last; ++__first)
3769 __f(*__first);
3770 return _GLIBCXX_MOVE(__f);
3771 }
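
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::for_each applying a lambda to every element;
// the returned functor is simply discarded here.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3};
  int sum = 0;
  std::for_each(v.begin(), v.end(), [&sum](int x) { sum += x; });
  assert(sum == 6);
}
// ----------------------------------------------------------------------------
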
3772
3773 /**
3774 * @brief Find the first occurrence of a value in a sequence.
3775 * @ingroup non_mutating_algorithms
3776 * @param __first An input iterator.
3777 * @param __last An input iterator.
3778 * @param __val The value to find.
3779 * @return The first iterator @c i in the range @p [__first,__last)
3780 * such that @c *i == @p __val, or @p __last if no such iterator exists.
3781 */
3782 template<typename _InputIterator, typename _Tp>
3783 inline _InputIterator
3784 find(_InputIterator __first, _InputIterator __last,
3785 const _Tp& __val)
3786 {
3787 // concept requirements
3788 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3789 __glibcxx_function_requires(_EqualOpConcept<
3790 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3791 __glibcxx_requires_valid_range(__first, __last);
3792 return std::__find_if(__first, __last,
3793 __gnu_cxx::__ops::__iter_equals_val(__val));
3794 }
3795
3796 /**
3797 * @brief Find the first element in a sequence for which a
3798 * predicate is true.
3799 * @ingroup non_mutating_algorithms
3800 * @param __first An input iterator.
3801 * @param __last An input iterator.
3802 * @param __pred A predicate.
3803 * @return The first iterator @c i in the range @p [__first,__last)
3804 * such that @p __pred(*i) is true, or @p __last if no such iterator exists.
3805 */
3806 template<typename _InputIterator, typename _Predicate>
3807 inline _InputIterator
3808 find_if(_InputIterator __first, _InputIterator __last,
3809 _Predicate __pred)
3810 {
3811 // concept requirements
3812 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3813 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3814 typename iterator_traits<_InputIterator>::value_type>)
3815 __glibcxx_requires_valid_range(__first, __last);
3816
3817 return std::__find_if(__first, __last,
3818 __gnu_cxx::__ops::__pred_iter(__pred));
3819 }
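
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::find and std::find_if on the same sequence;
// container contents are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{2, 4, 5, 8};
  assert(std::find(v.begin(), v.end(), 5) == v.begin() + 2);
  auto odd = std::find_if(v.begin(), v.end(), [](int x) { return x % 2 != 0; });
  assert(odd == v.begin() + 2);
}
// ----------------------------------------------------------------------------
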
3820
3821 /**
3822 * @brief Find element from a set in a sequence.
3823 * @ingroup non_mutating_algorithms
3824 * @param __first1 Start of range to search.
3825 * @param __last1 End of range to search.
3826 * @param __first2 Start of match candidates.
3827 * @param __last2 End of match candidates.
3828 * @return The first iterator @c i in the range
3829 * @p [__first1,__last1) such that @c *i == @p *(i2) for some iterator
3830 * i2 in [__first2,__last2), or @p __last1 if no such iterator exists.
3831 *
3832 * Searches the range @p [__first1,__last1) for an element that is
3833 * equal to some element in the range [__first2,__last2). If
3834 * found, returns an iterator in the range [__first1,__last1),
3835 * otherwise returns @p __last1.
3836 */
3837 template<typename _InputIterator, typename _ForwardIterator>
3838 _InputIterator
3839 find_first_of(_InputIterator __first1, _InputIterator __last1,
3840 _ForwardIterator __first2, _ForwardIterator __last2)
3841 {
3842 // concept requirements
3843 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3844 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3845 __glibcxx_function_requires(_EqualOpConcept<
3846 typename iterator_traits<_InputIterator>::value_type,
3847 typename iterator_traits<_ForwardIterator>::value_type>)
3848 __glibcxx_requires_valid_range(__first1, __last1);
3849 __glibcxx_requires_valid_range(__first2, __last2);
3850
3851 for (; __first1 != __last1; ++__first1)
3852 for (_ForwardIterator __iter = __first2; __iter != __last2; ++__iter)
3853 if (*__first1 == *__iter)
3854 return __first1;
3855 return __last1;
3856 }
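
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::find_first_of locating the first element of
// `haystack` that also appears in `candidates`; names and data are illustrative.
#include <algorithm>
#include <cassert>
#include <string>

int main()
{
  std::string haystack = "abcdef";
  std::string candidates = "xde";
  auto it = std::find_first_of(haystack.begin(), haystack.end(),
                               candidates.begin(), candidates.end());
  assert(it != haystack.end() && *it == 'd');
}
// ----------------------------------------------------------------------------
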
3857
3858 /**
3859 * @brief Find element from a set in a sequence using a predicate.
3860 * @ingroup non_mutating_algorithms
3861 * @param __first1 Start of range to search.
3862 * @param __last1 End of range to search.
3863 * @param __first2 Start of match candidates.
3864 * @param __last2 End of match candidates.
3865 * @param __comp Predicate to use.
3866 * @return The first iterator @c i in the range
3867 * @p [__first1,__last1) such that @c comp(*i, @p *(i2)) is true
3868 * and i2 is an iterator in [__first2,__last2), or @p __last1 if no
3869 * such iterator exists.
3870 *
3871
3872 * Searches the range @p [__first1,__last1) for an element that is
3873 * equal to some element in the range [__first2,__last2). If
3874 * found, returns an iterator in the range [__first1,__last1),
3875 * otherwise returns @p __last1.
3876 */
3877 template<typename _InputIterator, typename _ForwardIterator,
3878 typename _BinaryPredicate>
3879 _InputIterator
3880 find_first_of(_InputIterator __first1, _InputIterator __last1,
3881 _ForwardIterator __first2, _ForwardIterator __last2,
3882 _BinaryPredicate __comp)
3883 {
3884 // concept requirements
3885 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3886 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3887 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3888 typename iterator_traits<_InputIterator>::value_type,
3889 typename iterator_traits<_ForwardIterator>::value_type>)
3890 __glibcxx_requires_valid_range(__first1, __last1);
3891 __glibcxx_requires_valid_range(__first2, __last2);
3892
3893 for (; __first1 != __last1; ++__first1)
3894 for (_ForwardIterator __iter = __first2; __iter != __last2; ++__iter)
3895 if (__comp(*__first1, *__iter))
3896 return __first1;
3897 return __last1;
3898 }
3899
3900 /**
3901 * @brief Find two adjacent values in a sequence that are equal.
3902 * @ingroup non_mutating_algorithms
3903 * @param __first A forward iterator.
3904 * @param __last A forward iterator.
3905 * @return The first iterator @c i such that @c i and @c i+1 are both
3906 * valid iterators in @p [__first,__last) and such that @c *i == @c *(i+1),
3907 * or @p __last if no such iterator exists.
3908 */
3909 template<typename _ForwardIterator>
3910 inline _ForwardIterator
3911 adjacent_find(_ForwardIterator __first, _ForwardIterator __last)
3912 {
3913 // concept requirements
3914 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3915 __glibcxx_function_requires(_EqualityComparableConcept<
3916 typename iterator_traits<_ForwardIterator>::value_type>)
3917 __glibcxx_requires_valid_range(__first, __last);
3918
3919 return std::__adjacent_find(__first, __last,
3920 __gnu_cxx::__ops::__iter_equal_to_iter());
3921 }
3922
3923 /**
3924 * @brief Find two adjacent values in a sequence using a predicate.
3925 * @ingroup non_mutating_algorithms
3926 * @param __first A forward iterator.
3927 * @param __last A forward iterator.
3928 * @param __binary_pred A binary predicate.
3929 * @return The first iterator @c i such that @c i and @c i+1 are both
3930 * valid iterators in @p [__first,__last) and such that
3931 * @p __binary_pred(*i,*(i+1)) is true, or @p __last if no such iterator
3932 * exists.
3933 */
3934 template<typename _ForwardIterator, typename _BinaryPredicate>
3935 inline _ForwardIterator
3936 adjacent_find(_ForwardIterator __first, _ForwardIterator __last,
3937 _BinaryPredicate __binary_pred)
3938 {
3939 // concept requirements
3940 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3941 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3942 typename iterator_traits<_ForwardIterator>::value_type,
3943 typename iterator_traits<_ForwardIterator>::value_type>)
3944 __glibcxx_requires_valid_range(__first, __last);
3945
3946 return std::__adjacent_find(__first, __last,
3947 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred));
3948 }
3949
3950 /**
3951 * @brief Count the number of copies of a value in a sequence.
3952 * @ingroup non_mutating_algorithms
3953 * @param __first An input iterator.
3954 * @param __last An input iterator.
3955 * @param __value The value to be counted.
3956 * @return The number of iterators @c i in the range @p [__first,__last)
3957 * for which @c *i == @p __value
3958 */
3959 template<typename _InputIterator, typename _Tp>
3960 inline typename iterator_traits<_InputIterator>::difference_type
3961 count(_InputIterator __first, _InputIterator __last, const _Tp& __value)
3962 {
3963 // concept requirements
3964 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3965 __glibcxx_function_requires(_EqualOpConcept<
3966 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3967 __glibcxx_requires_valid_range(__first, __last);
3968
3969 return std::__count_if(__first, __last,
3970 __gnu_cxx::__ops::__iter_equals_val(__value));
3971 }
3972
3973 /**
3974 * @brief Count the elements of a sequence for which a predicate is true.
3975 * @ingroup non_mutating_algorithms
3976 * @param __first An input iterator.
3977 * @param __last An input iterator.
3978 * @param __pred A predicate.
3979 * @return The number of iterators @c i in the range @p [__first,__last)
3980 * for which @p __pred(*i) is true.
3981 */
3982 template<typename _InputIterator, typename _Predicate>
3983 inline typename iterator_traits<_InputIterator>::difference_type
3984 count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred)
3985 {
3986 // concept requirements
3987 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3988 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3989 typename iterator_traits<_InputIterator>::value_type>)
3990 __glibcxx_requires_valid_range(__first, __last);
3991
3992 return std::__count_if(__first, __last,
3993 __gnu_cxx::__ops::__pred_iter(__pred));
3994 }
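
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::count and std::count_if over a small vector;
// the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 2, 3, 2};
  assert(std::count(v.begin(), v.end(), 2) == 3);
  assert(std::count_if(v.begin(), v.end(), [](int x) { return x > 1; }) == 4);
}
// ----------------------------------------------------------------------------
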
3995
3996 /**
3997 * @brief Search a sequence for a matching sub-sequence.
3998 * @ingroup non_mutating_algorithms
3999 * @param __first1 A forward iterator.
4000 * @param __last1 A forward iterator.
4001 * @param __first2 A forward iterator.
4002 * @param __last2 A forward iterator.
4003 * @return The first iterator @c i in the range @p
4004 * [__first1,__last1-(__last2-__first2)) such that @c *(i+N) == @p
4005 * *(__first2+N) for each @c N in the range @p
4006 * [0,__last2-__first2), or @p __last1 if no such iterator exists.
4007 *
4008 * Searches the range @p [__first1,__last1) for a sub-sequence that
4009 * compares equal value-by-value with the sequence given by @p
4010 * [__first2,__last2) and returns an iterator to the first element
4011 * of the sub-sequence, or @p __last1 if the sub-sequence is not
4012 * found.
4013 *
4014 * Because the sub-sequence must lie completely within the range @p
4015 * [__first1,__last1) it must start at a position less than @p
4016 * __last1-(__last2-__first2) where @p __last2-__first2 is the
4017 * length of the sub-sequence.
4018 *
4019 * This means that the returned iterator @c i will be in the range
4020 * @p [__first1,__last1-(__last2-__first2))
4021 */
4022 template<typename _ForwardIterator1, typename _ForwardIterator2>
4023 inline _ForwardIterator1
4024 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
4025 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
4026 {
4027 // concept requirements
4028 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
4029 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
4030 __glibcxx_function_requires(_EqualOpConcept<
4031 typename iterator_traits<_ForwardIterator1>::value_type,
4032 typename iterator_traits<_ForwardIterator2>::value_type>)
4033 __glibcxx_requires_valid_range(__first1, __last1);
4034 __glibcxx_requires_valid_range(__first2, __last2);
4035
4036 return std::__search(__first1, __last1, __first2, __last2,
4037 __gnu_cxx::__ops::__iter_equal_to_iter());
4038 }
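
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::search finding a sub-sequence inside a larger
// sequence; the strings are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <string>

int main()
{
  std::string text = "one two three";
  std::string needle = "two";
  auto it = std::search(text.begin(), text.end(), needle.begin(), needle.end());
  assert(it == text.begin() + 4);
}
// ----------------------------------------------------------------------------
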
4039
4040 /**
4041 * @brief Search a sequence for a matching sub-sequence using a predicate.
4042 * @ingroup non_mutating_algorithms
4043 * @param __first1 A forward iterator.
4044 * @param __last1 A forward iterator.
4045 * @param __first2 A forward iterator.
4046 * @param __last2 A forward iterator.
4047 * @param __predicate A binary predicate.
4048 * @return The first iterator @c i in the range
4049 * @p [__first1,__last1-(__last2-__first2)) such that
4050 * @p __predicate(*(i+N),*(__first2+N)) is true for each @c N in the range
4051 * @p [0,__last2-__first2), or @p __last1 if no such iterator exists.
4052 *
4053 * Searches the range @p [__first1,__last1) for a sub-sequence that
4054 * compares equal value-by-value with the sequence given by @p
4055 * [__first2,__last2), using @p __predicate to determine equality,
4056 * and returns an iterator to the first element of the
4057 * sub-sequence, or @p __last1 if no such iterator exists.
4058 *
4059 * @see search(_ForwardIter1, _ForwardIter1, _ForwardIter2, _ForwardIter2)
4060 */
4061 template<typename _ForwardIterator1, typename _ForwardIterator2,
4062 typename _BinaryPredicate>
4063 inline _ForwardIterator1
4064 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
4065 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
4066 _BinaryPredicate __predicate)
4067 {
4068 // concept requirements
4069 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
4070 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
4071 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
4072 typename iterator_traits<_ForwardIterator1>::value_type,
4073 typename iterator_traits<_ForwardIterator2>::value_type>)
4074 __glibcxx_requires_valid_range(__first1, __last1);
4075 __glibcxx_requires_valid_range(__first2, __last2);
4076
4077 return std::__search(__first1, __last1, __first2, __last2,
4078 __gnu_cxx::__ops::__iter_comp_iter(__predicate));
4079 }
4080
4081 /**
4082 * @brief Search a sequence for a number of consecutive values.
4083 * @ingroup non_mutating_algorithms
4084 * @param __first A forward iterator.
4085 * @param __last A forward iterator.
4086 * @param __count The number of consecutive values.
4087 * @param __val The value to find.
4088 * @return The first iterator @c i in the range @p
4089 * [__first,__last-__count) such that @c *(i+N) == @p __val for
4090 * each @c N in the range @p [0,__count), or @p __last if no such
4091 * iterator exists.
4092 *
4093 * Searches the range @p [__first,__last) for @p count consecutive elements
4094 * equal to @p __val.
4095 */
4096 template<typename _ForwardIterator, typename _Integer, typename _Tp>
4097 inline _ForwardIterator
4098 search_n(_ForwardIterator __first, _ForwardIterator __last,
4099 _Integer __count, const _Tp& __val)
4100 {
4101 // concept requirements
4102 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4103 __glibcxx_function_requires(_EqualOpConcept<
4104 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4105 __glibcxx_requires_valid_range(__first, __last);
4106
4107 return std::__search_n(__first, __last, __count,
4108 __gnu_cxx::__ops::__iter_equals_val(__val));
4109 }
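
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::search_n looking for two consecutive copies
// of the same value; the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{1, 7, 7, 2, 7};
  auto it = std::search_n(v.begin(), v.end(), 2, 7);
  assert(it == v.begin() + 1);   // points at the first of the two consecutive 7s
}
// ----------------------------------------------------------------------------
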
4110
4111
4112 /**
4113 * @brief Search a sequence for a number of consecutive values using a
4114 * predicate.
4115 * @ingroup non_mutating_algorithms
4116 * @param __first A forward iterator.
4117 * @param __last A forward iterator.
4118 * @param __count The number of consecutive values.
4119 * @param __val The value to find.
4120 * @param __binary_pred A binary predicate.
4121 * @return The first iterator @c i in the range @p
4122 * [__first,__last-__count) such that @p
4123 * __binary_pred(*(i+N),__val) is true for each @c N in the range
4124 * @p [0,__count), or @p __last if no such iterator exists.
4125 *
4126 * Searches the range @p [__first,__last) for @p __count
4127 * consecutive elements for which the predicate returns true.
4128 */
4129 template<typename _ForwardIterator, typename _Integer, typename _Tp,
4130 typename _BinaryPredicate>
4131 inline _ForwardIterator
4132 search_n(_ForwardIterator __first, _ForwardIterator __last,
4133 _Integer __count, const _Tp& __val,
4134 _BinaryPredicate __binary_pred)
4135 {
4136 // concept requirements
4137 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4138 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
4139 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4140 __glibcxx_requires_valid_range(__first, __last);
4141
4142 return std::__search_n(__first, __last, __count,
4143 __gnu_cxx::__ops::__iter_comp_val(__binary_pred, __val));
4144 }
4145
4146
4147 /**
4148 * @brief Perform an operation on a sequence.
4149 * @ingroup mutating_algorithms
4150 * @param __first An input iterator.
4151 * @param __last An input iterator.
4152 * @param __result An output iterator.
4153 * @param __unary_op A unary operator.
4154 * @return An output iterator equal to @p __result+(__last-__first).
4155 *
4156 * Applies the operator to each element in the input range and assigns
4157 * the results to successive elements of the output sequence.
4158 * Evaluates @p *(__result+N)=unary_op(*(__first+N)) for each @c N in the
4159 * range @p [0,__last-__first).
4160 *
4161 * @p unary_op must not alter its argument.
4162 */
4163 template<typename _InputIterator, typename _OutputIterator,
4164 typename _UnaryOperation>
4165 _OutputIterator
4166 transform(_InputIterator __first, _InputIterator __last,
4167 _OutputIterator __result, _UnaryOperation __unary_op)
4168 {
4169 // concept requirements
4170 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4171 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4172 // "the type returned by a _UnaryOperation"
4173 __typeof__(__unary_op(*__first))>)
4174 __glibcxx_requires_valid_range(__first, __last);
4175
4176 for (; __first != __last; ++__first, (void)++__result)
4177 *__result = __unary_op(*__first);
4178 return __result;
4179 }
4180
4181 /**
4182 * @brief Perform an operation on corresponding elements of two sequences.
4183 * @ingroup mutating_algorithms
4184 * @param __first1 An input iterator.
4185 * @param __last1 An input iterator.
4186 * @param __first2 An input iterator.
4187 * @param __result An output iterator.
4188 * @param __binary_op A binary operator.
4189 * @return An output iterator equal to @p result+(last-first).
4190 *
4191 * Applies the operator to the corresponding elements in the two
4192 * input ranges and assigns the results to successive elements of the
4193 * output sequence.
4194 * Evaluates @p
4195 * *(__result+N)=__binary_op(*(__first1+N),*(__first2+N)) for each
4196 * @c N in the range @p [0,__last1-__first1).
4197 *
4198 * @p binary_op must not alter either of its arguments.
4199 */
4200 template<typename _InputIterator1, typename _InputIterator2,
4201 typename _OutputIterator, typename _BinaryOperation>
4202 _OutputIterator
4203 transform(_InputIterator1 __first1, _InputIterator1 __last1,
4204 _InputIterator2 __first2, _OutputIterator __result,
4205 _BinaryOperation __binary_op)
4206 {
4207 // concept requirements
4208 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4209 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4210 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4211 // "the type returned by a _BinaryOperation"
4212 __typeof__(__binary_op(*__first1,*__first2))>)
4213 __glibcxx_requires_valid_range(__first1, __last1);
4214
4215 for (; __first1 != __last1; ++__first1, (void)++__first2, ++__result)
4216 *__result = __binary_op(*__first1, *__first2);
4217 return __result;
4218 }
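
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing the unary and binary forms of std::transform
// writing into a destination vector; all data is illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> a{1, 2, 3};
  std::vector<int> b{10, 20, 30};
  std::vector<int> out(3);
  std::transform(a.begin(), a.end(), out.begin(), [](int x) { return x * 2; });
  assert(out == (std::vector<int>{2, 4, 6}));
  std::transform(a.begin(), a.end(), b.begin(), out.begin(),
                 [](int x, int y) { return x + y; });
  assert(out == (std::vector<int>{11, 22, 33}));
}
// ----------------------------------------------------------------------------
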
4219
4220 /**
4221 * @brief Replace each occurrence of one value in a sequence with another
4222 * value.
4223 * @ingroup mutating_algorithms
4224 * @param __first A forward iterator.
4225 * @param __last A forward iterator.
4226 * @param __old_value The value to be replaced.
4227 * @param __new_value The replacement value.
4228 * @return replace() returns no value.
4229 *
4230 * For each iterator @c i in the range @p [__first,__last) if @c *i ==
4231 * @p __old_value then the assignment @c *i = @p __new_value is performed.
4232 */
4233 template<typename _ForwardIterator, typename _Tp>
4234 void
4235 replace(_ForwardIterator __first, _ForwardIterator __last,
4236 const _Tp& __old_value, const _Tp& __new_value)
4237 {
4238 // concept requirements
4239 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4240 _ForwardIterator>)
4241 __glibcxx_function_requires(_EqualOpConcept<
4242 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4243 __glibcxx_function_requires(_ConvertibleConcept<_Tp,
4244 typename iterator_traits<_ForwardIterator>::value_type>)
4245 __glibcxx_requires_valid_range(__first, __last);
4246
4247 for (; __first != __last; ++__first)
4248 if (*__first == __old_value)
4249 *__first = __new_value;
4250 }
4251
4252 /**
4253 * @brief Replace each value in a sequence for which a predicate returns
4254 * true with another value.
4255 * @ingroup mutating_algorithms
4256 * @param __first A forward iterator.
4257 * @param __last A forward iterator.
4258 * @param __pred A predicate.
4259 * @param __new_value The replacement value.
4260 * @return replace_if() returns no value.
4261 *
4262 * For each iterator @c i in the range @p [__first,__last) if @p __pred(*i)
4263 * is true then the assignment @c *i = @p __new_value is performed.
4264 */
4265 template<typename _ForwardIterator, typename _Predicate, typename _Tp>
4266 void
4267 replace_if(_ForwardIterator __first, _ForwardIterator __last,
4268 _Predicate __pred, const _Tp& __new_value)
4269 {
4270 // concept requirements
4271 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4272 _ForwardIterator>)
4273 __glibcxx_function_requires(_ConvertibleConcept<_Tp,
4274 typename iterator_traits<_ForwardIterator>::value_type>)
4275 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
4276 typename iterator_traits<_ForwardIterator>::value_type>)
4277 __glibcxx_requires_valid_range(__first, __last);
4278
4279 for (; __first != __last; ++__first)
4280 if (__pred(*__first))
4281 *__first = __new_value;
4282 }
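
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::replace and std::replace_if rewriting
// elements in place; the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{1, 0, 2, 0, 3};
  std::replace(v.begin(), v.end(), 0, -1);            // every 0 becomes -1
  std::replace_if(v.begin(), v.end(),
                  [](int x) { return x > 2; }, 2);    // values above 2 become 2
  assert(v == (std::vector<int>{1, -1, 2, -1, 2}));
}
// ----------------------------------------------------------------------------
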
4283
4284 /**
4285 * @brief Assign the result of a function object to each value in a
4286 * sequence.
4287 * @ingroup mutating_algorithms
4288 * @param __first A forward iterator.
4289 * @param __last A forward iterator.
4290 * @param __gen A function object taking no arguments and returning
4291 * std::iterator_traits<_ForwardIterator>::value_type
4292 * @return generate() returns no value.
4293 *
4294 * Performs the assignment @c *i = @p __gen() for each @c i in the range
4295 * @p [__first,__last).
4296 */
4297 template<typename _ForwardIterator, typename _Generator>
4298 void
4299 generate(_ForwardIterator __first, _ForwardIterator __last,
4300 _Generator __gen)
4301 {
4302 // concept requirements
4303 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4304 __glibcxx_function_requires(_GeneratorConcept<_Generator,
4305 typename iterator_traits<_ForwardIterator>::value_type>)
4306 __glibcxx_requires_valid_range(__first, __last);
4307
4308 for (; __first != __last; ++__first)
4309 *__first = __gen();
4310 }
4311
4312 /**
4313 * @brief Assign the result of a function object to each value in a
4314 * sequence.
4315 * @ingroup mutating_algorithms
4316 * @param __first A forward iterator.
4317 * @param __n The length of the sequence.
4318 * @param __gen A function object taking no arguments and returning
4319 * std::iterator_traits<_ForwardIterator>::value_type
4320 * @return The end of the sequence, @p __first+__n
4321 *
4322 * Performs the assignment @c *i = @p __gen() for each @c i in the range
4323 * @p [__first,__first+__n).
4324 *
4325 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4326 * DR 865. More algorithms that throw away information
4327 */
4328 template<typename _OutputIterator, typename _Size, typename _Generator>
4329 _OutputIterator
4330 generate_n(_OutputIterator __first, _Size __n, _Generator __gen)
4331 {
4332 // concept requirements
4333 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4334 // "the type returned by a _Generator"
4335 __typeof__(__gen())>)
4336
4337 for (__decltype(__n + 0) __niter = __n;
4338 __niter > 0; --__niter, ++__first)
4339 *__first = __gen();
4340 return __first;
4341 }
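
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::generate_n filling a vector through a
// back_inserter with a simple counter; names and values are illustrative.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main()
{
  std::vector<int> v;
  int n = 0;
  std::generate_n(std::back_inserter(v), 3, [&n] { return n++; });
  assert(v == (std::vector<int>{0, 1, 2}));
}
// ----------------------------------------------------------------------------
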
4342
4343 /**
4344 * @brief Copy a sequence, removing consecutive duplicate values.
4345 * @ingroup mutating_algorithms
4346 * @param __first An input iterator.
4347 * @param __last An input iterator.
4348 * @param __result An output iterator.
4349 * @return An iterator designating the end of the resulting sequence.
4350 *
4351 * Copies each element in the range @p [__first,__last) to the range
4352 * beginning at @p __result, except that only the first element is copied
4353 * from groups of consecutive elements that compare equal.
4354 * unique_copy() is stable, so the relative order of elements that are
4355 * copied is unchanged.
4356 *
4357 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4358 * DR 241. Does unique_copy() require CopyConstructible and Assignable?
4359 *
4360 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4361 * DR 538. 241 again: Does unique_copy() require CopyConstructible and
4362 * Assignable?
4363 */
4364 template<typename _InputIterator, typename _OutputIterator>
4365 inline _OutputIterator
4366 unique_copy(_InputIterator __first, _InputIterator __last,
4367 _OutputIterator __result)
4368 {
4369 // concept requirements
4370 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4371 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4372 typename iterator_traits<_InputIterator>::value_type>)
4373 __glibcxx_function_requires(_EqualityComparableConcept<
4374 typename iterator_traits<_InputIterator>::value_type>)
4375 __glibcxx_requires_valid_range(__first, __last);
4376
4377 if (__first == __last)
4378 return __result;
4379 return std::__unique_copy(__first, __last, __result,
4380 __gnu_cxx::__ops::__iter_equal_to_iter(),
4381 std::__iterator_category(__first),
4382 std::__iterator_category(__result));
4383 }
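
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::unique_copy collapsing runs of consecutive
// duplicates into an output vector; the input values are illustrative.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main()
{
  std::vector<int> in{1, 1, 2, 2, 2, 3, 1};
  std::vector<int> out;
  std::unique_copy(in.begin(), in.end(), std::back_inserter(out));
  assert(out == (std::vector<int>{1, 2, 3, 1}));
}
// ----------------------------------------------------------------------------
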
4384
4385 /**
4386 * @brief Copy a sequence, removing consecutive values using a predicate.
4387 * @ingroup mutating_algorithms
4388 * @param __first An input iterator.
4389 * @param __last An input iterator.
4390 * @param __result An output iterator.
4391 * @param __binary_pred A binary predicate.
4392 * @return An iterator designating the end of the resulting sequence.
4393 *
4394 * Copies each element in the range @p [__first,__last) to the range
4395 * beginning at @p __result, except that only the first element is copied
4396 * from groups of consecutive elements for which @p __binary_pred returns
4397 * true.
4398 * unique_copy() is stable, so the relative order of elements that are
4399 * copied is unchanged.
4400 *
4401 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4402 * DR 241. Does unique_copy() require CopyConstructible and Assignable?
4403 */
4404 template<typename _InputIterator, typename _OutputIterator,
4405 typename _BinaryPredicate>
4406 inline _OutputIterator
4407 unique_copy(_InputIterator __first, _InputIterator __last,
4408 _OutputIterator __result,
4409 _BinaryPredicate __binary_pred)
4410 {
4411 // concept requirements -- predicates checked later
4412 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4413 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4414 typename iterator_traits<_InputIterator>::value_type>)
4415 __glibcxx_requires_valid_range(__first, __last);
4416
4417 if (__first == __last)
4418 return __result;
4419 return std::__unique_copy(__first, __last, __result,
4420 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred),
4421 std::__iterator_category(__first),
4422 std::__iterator_category(__result));
4423 }
4424
4425#if _GLIBCXX_HOSTED
4426 /**
4427 * @brief Randomly shuffle the elements of a sequence.
4428 * @ingroup mutating_algorithms
4429 * @param __first A forward iterator.
4430 * @param __last A forward iterator.
4431 * @return Nothing.
4432 *
4433 * Reorder the elements in the range @p [__first,__last) using a random
4434 * distribution, so that every possible ordering of the sequence is
4435 * equally likely.
4436 */
4437 template<typename _RandomAccessIterator>
4438 inline void
4439 random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last)
4440 {
4441 // concept requirements
4442 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4443 _RandomAccessIterator>)
4444 __glibcxx_requires_valid_range(__first, __last);
4445
4446 if (__first != __last)
4447 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
4448 {
4449 // XXX rand() % N is not uniformly distributed
4450 _RandomAccessIterator __j = __first
4451 + std::rand() % ((__i - __first) + 1);
4452 if (__i != __j)
4453 std::iter_swap(__i, __j);
4454 }
4455 }
4456#endif
4457
4458 /**
4459 * @brief Shuffle the elements of a sequence using a random number
4460 * generator.
4461 * @ingroup mutating_algorithms
4462 * @param __first A forward iterator.
4463 * @param __last A forward iterator.
4464 * @param __rand The RNG functor or function.
4465 * @return Nothing.
4466 *
4467 * Reorders the elements in the range @p [__first,__last) using @p __rand to
4468 * provide a random distribution. Calling @p __rand(N) for a positive
4469 * integer @p N should return a randomly chosen integer from the
4470 * range [0,N).
4471 */
4472 template<typename _RandomAccessIterator, typename _RandomNumberGenerator>
4473 void
4474 random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last,
4475#if __cplusplus >= 201103L
4476 _RandomNumberGenerator&& __rand)
4477#else
4478 _RandomNumberGenerator& __rand)
4479#endif
4480 {
4481 // concept requirements
4482 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4483 _RandomAccessIterator>)
4484 __glibcxx_requires_valid_range(__first, __last);
4485
4486 if (__first == __last)
4487 return;
4488 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
4489 {
4490 _RandomAccessIterator __j = __first + __rand((__i - __first) + 1);
4491 if (__i != __j)
4492 std::iter_swap(__i, __j);
4493 }
4494 }
4495
4496
4497 /**
4498 * @brief Move elements for which a predicate is true to the beginning
4499 * of a sequence.
4500 * @ingroup mutating_algorithms
4501 * @param __first A forward iterator.
4502 * @param __last A forward iterator.
4503 * @param __pred A predicate functor.
4504 * @return An iterator @p middle such that @p __pred(i) is true for each
4505 * iterator @p i in the range @p [__first,middle) and false for each @p i
4506 * in the range @p [middle,__last).
4507 *
4508 * @p __pred must not modify its operand. @p partition() does not preserve
4509 * the relative ordering of elements in each group, use
4510 * @p stable_partition() if this is needed.
4511 */
4512 template<typename _ForwardIterator, typename _Predicate>
4513 inline _ForwardIterator
4514 partition(_ForwardIterator __first, _ForwardIterator __last,
4515 _Predicate __pred)
4516 {
4517 // concept requirements
4518 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4519 _ForwardIterator>)
4520 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
4521 typename iterator_traits<_ForwardIterator>::value_type>)
4522 __glibcxx_requires_valid_range(__first, __last);
4523
4524 return std::__partition(__first, __last, __pred,
4525 std::__iterator_category(__first));
4526 }
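
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::partition moving even numbers ahead of odd
// ones (relative order within each group is not preserved); data is illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3, 4, 5, 6};
  auto mid = std::partition(v.begin(), v.end(),
                            [](int x) { return x % 2 == 0; });
  assert(std::all_of(v.begin(), mid, [](int x) { return x % 2 == 0; }));
  assert(std::none_of(mid, v.end(), [](int x) { return x % 2 == 0; }));
}
// ----------------------------------------------------------------------------
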
4527
4528
4529 /**
4530 * @brief Sort the smallest elements of a sequence.
4531 * @ingroup sorting_algorithms
4532 * @param __first An iterator.
4533 * @param __middle Another iterator.
4534 * @param __last Another iterator.
4535 * @return Nothing.
4536 *
4537 * Sorts the smallest @p (__middle-__first) elements in the range
4538 * @p [__first,__last) and moves them to the range @p [__first,__middle). The
4539 * order of the remaining elements in the range @p [__middle,__last) is
4540 * undefined.
4541 * After the sort if @e i and @e j are iterators in the range
4542 * @p [__first,__middle) such that i precedes j and @e k is an iterator in
4543 * the range @p [__middle,__last) then *j<*i and *k<*i are both false.
4544 */
4545 template<typename _RandomAccessIterator>
4546 inline void
4547 partial_sort(_RandomAccessIterator __first,
4548 _RandomAccessIterator __middle,
4549 _RandomAccessIterator __last)
4550 {
4551 // concept requirements
4552 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4553 _RandomAccessIterator>)
4554 __glibcxx_function_requires(_LessThanComparableConcept<
4555 typename iterator_traits<_RandomAccessIterator>::value_type>)
4556 __glibcxx_requires_valid_range(__first, __middle);
4557 __glibcxx_requires_valid_range(__middle, __last);
4558 __glibcxx_requires_irreflexive(__first, __last);
4559
4560 std::__partial_sort(__first, __middle, __last,
4561 __gnu_cxx::__ops::__iter_less_iter());
4562 }
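
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::partial_sort placing the three smallest
// elements, in order, at the front; the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{5, 2, 8, 1, 9, 3};
  std::partial_sort(v.begin(), v.begin() + 3, v.end());
  assert(v[0] == 1 && v[1] == 2 && v[2] == 3);  // order of the rest unspecified
}
// ----------------------------------------------------------------------------
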
4563
4564 /**
4565 * @brief Sort the smallest elements of a sequence using a predicate
4566 * for comparison.
4567 * @ingroup sorting_algorithms
4568 * @param __first An iterator.
4569 * @param __middle Another iterator.
4570 * @param __last Another iterator.
4571 * @param __comp A comparison functor.
4572 * @return Nothing.
4573 *
4574 * Sorts the smallest @p (__middle-__first) elements in the range
4575 * @p [__first,__last) and moves them to the range @p [__first,__middle). The
4576 * order of the remaining elements in the range @p [__middle,__last) is
4577 * undefined.
4578 * After the sort if @e i and @e j are iterators in the range
4579 * @p [__first,__middle) such that i precedes j and @e k is an iterator in
4580 * the range @p [__middle,__last) then @p __comp(*j,*i) and @p __comp(*k,*i)
4581 * are both false.
4582 */
4583 template<typename _RandomAccessIterator, typename _Compare>
4584 inline void
4585 partial_sort(_RandomAccessIterator __first,
4586 _RandomAccessIterator __middle,
4587 _RandomAccessIterator __last,
4588 _Compare __comp)
4589 {
4590 // concept requirements
4591 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4592 _RandomAccessIterator>)
4593 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4594 typename iterator_traits<_RandomAccessIterator>::value_type,
4595 typename iterator_traits<_RandomAccessIterator>::value_type>)
4596 __glibcxx_requires_valid_range(__first, __middle);
4597 __glibcxx_requires_valid_range(__middle, __last);
4598 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4599
4600 std::__partial_sort(__first, __middle, __last,
4601 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4602 }
4603
4604 /**
4605 * @brief Sort a sequence just enough to find a particular position.
4606 * @ingroup sorting_algorithms
4607 * @param __first An iterator.
4608 * @param __nth Another iterator.
4609 * @param __last Another iterator.
4610 * @return Nothing.
4611 *
4612 * Rearranges the elements in the range @p [__first,__last) so that @p *__nth
4613 * is the same element that would have been in that position had the
4614 * whole sequence been sorted. The elements either side of @p *__nth are
4615 * not completely sorted, but for any iterator @e i in the range
4616 * @p [__first,__nth) and any iterator @e j in the range @p [__nth,__last) it
4617 * holds that *j < *i is false.
4618 */
4619 template<typename _RandomAccessIterator>
4620 inline void
4621 nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth,
4622 _RandomAccessIterator __last)
4623 {
4624 // concept requirements
4625 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4626 _RandomAccessIterator>)
4627 __glibcxx_function_requires(_LessThanComparableConcept<
4628 typename iterator_traits<_RandomAccessIterator>::value_type>)
4629 __glibcxx_requires_valid_range(__first, __nth);
4630 __glibcxx_requires_valid_range(__nth, __last);
4631 __glibcxx_requires_irreflexive(__first, __last);
4632
4633 if (__first == __last || __nth == __last)
4634 return;
4635
4636 std::__introselect(__first, __nth, __last,
4637 std::__lg(__last - __first) * 2,
4638 __gnu_cxx::__ops::__iter_less_iter());
4639 }
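
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::nth_element placing the median of the range
// in its sorted position; the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> v{9, 1, 8, 2, 7, 3, 6};
  auto mid = v.begin() + v.size() / 2;
  std::nth_element(v.begin(), mid, v.end());
  assert(*mid == 6);   // the median of {1,2,3,6,7,8,9}
}
// ----------------------------------------------------------------------------
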
4640
4641 /**
4642 * @brief Sort a sequence just enough to find a particular position
4643 * using a predicate for comparison.
4644 * @ingroup sorting_algorithms
4645 * @param __first An iterator.
4646 * @param __nth Another iterator.
4647 * @param __last Another iterator.
4648 * @param __comp A comparison functor.
4649 * @return Nothing.
4650 *
4651 * Rearranges the elements in the range @p [__first,__last) so that @p *__nth
4652 * is the same element that would have been in that position had the
4653 * whole sequence been sorted. The elements either side of @p *__nth are
4654 * not completely sorted, but for any iterator @e i in the range
4655 * @p [__first,__nth) and any iterator @e j in the range @p [__nth,__last) it
4656 * holds that @p __comp(*j,*i) is false.
4657 */
4658 template<typename _RandomAccessIterator, typename _Compare>
4659 inline void
4660 nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth,
4661 _RandomAccessIterator __last, _Compare __comp)
4662 {
4663 // concept requirements
4664 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4665 _RandomAccessIterator>)
4666 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4667 typename iterator_traits<_RandomAccessIterator>::value_type,
4668 typename iterator_traits<_RandomAccessIterator>::value_type>)
4669 __glibcxx_requires_valid_range(__first, __nth);
4670 __glibcxx_requires_valid_range(__nth, __last);
4671 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4672
4673 if (__first == __last || __nth == __last)
4674 return;
4675
4676 std::__introselect(__first, __nth, __last,
4677 std::__lg(__last - __first) * 2,
4678 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4679 }
4680
4681 /**
4682 * @brief Sort the elements of a sequence.
4683 * @ingroup sorting_algorithms
4684 * @param __first An iterator.
4685 * @param __last Another iterator.
4686 * @return Nothing.
4687 *
4688 * Sorts the elements in the range @p [__first,__last) in ascending order,
4689 * such that for each iterator @e i in the range @p [__first,__last-1),
4690 * *(i+1)<*i is false.
4691 *
4692 * The relative ordering of equivalent elements is not preserved, use
4693 * @p stable_sort() if this is needed.
4694 */
4695 template<typename _RandomAccessIterator>
4696 inline void
4697 sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
4698 {
4699 // concept requirements
4700 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4701 _RandomAccessIterator>)
4702 __glibcxx_function_requires(_LessThanComparableConcept<
4703 typename iterator_traits<_RandomAccessIterator>::value_type>)
4704 __glibcxx_requires_valid_range(__first, __last);
4705 __glibcxx_requires_irreflexive(__first, __last);
4706
4707 std::__sort(__first, __last, __gnu_cxx::__ops::__iter_less_iter());
4708 }
4709
4710 /**
4711 * @brief Sort the elements of a sequence using a predicate for comparison.
4712 * @ingroup sorting_algorithms
4713 * @param __first An iterator.
4714 * @param __last Another iterator.
4715 * @param __comp A comparison functor.
4716 * @return Nothing.
4717 *
4718 * Sorts the elements in the range @p [__first,__last) in ascending order,
4719 * such that @p __comp(*(i+1),*i) is false for every iterator @e i in the
4720 * range @p [__first,__last-1).
4721 *
4722 * The relative ordering of equivalent elements is not preserved, use
4723 * @p stable_sort() if this is needed.
4724 */
4725 template<typename _RandomAccessIterator, typename _Compare>
4726 inline void
4727 sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4728 _Compare __comp)
4729 {
4730 // concept requirements
4731 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4732 _RandomAccessIterator>)
4733 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4734 typename iterator_traits<_RandomAccessIterator>::value_type,
4735 typename iterator_traits<_RandomAccessIterator>::value_type>)
4736 __glibcxx_requires_valid_range(__first, __last);
4737 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4738
4739 std::__sort(__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
4740 }
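
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::sort in ascending order and, with
// std::greater<int>, in descending order; the values are illustrative.
#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

int main()
{
  std::vector<int> v{3, 1, 2};
  std::sort(v.begin(), v.end());
  assert(v == (std::vector<int>{1, 2, 3}));
  std::sort(v.begin(), v.end(), std::greater<int>());
  assert(v == (std::vector<int>{3, 2, 1}));
}
// ----------------------------------------------------------------------------
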
4741
4742 template<typename _InputIterator1, typename _InputIterator2,
4743 typename _OutputIterator, typename _Compare>
4744 _OutputIterator
4745 __merge(_InputIterator1 __first1, _InputIterator1 __last1,
4746 _InputIterator2 __first2, _InputIterator2 __last2,
4747 _OutputIterator __result, _Compare __comp)
4748 {
4749 while (__first1 != __last1 && __first2 != __last2)
4750 {
4751 if (__comp(__first2, __first1))
4752 {
4753 *__result = *__first2;
4754 ++__first2;
4755 }
4756 else
4757 {
4758 *__result = *__first1;
4759 ++__first1;
4760 }
4761 ++__result;
4762 }
4763 return std::copy(__first2, __last2,
4764 std::copy(__first1, __last1, __result));
4765 }
4766
4767 /**
4768 * @brief Merges two sorted ranges.
4769 * @ingroup sorting_algorithms
4770 * @param __first1 An iterator.
4771 * @param __first2 Another iterator.
4772 * @param __last1 Another iterator.
4773 * @param __last2 Another iterator.
4774 * @param __result An output iterator for the start of the merged range.
4775 * @return An iterator pointing to the end of the merged output,
4776 * i.e. @p __result + (__last1-__first1) + (__last2-__first2).
4777 *
4778 * Merges the ranges @p [__first1,__last1) and @p [__first2,__last2) into
4779 * the sorted range @p [__result, __result + (__last1-__first1) +
4780 * (__last2-__first2)). Both input ranges must be sorted, and the
4781 * output range must not overlap with either of the input ranges.
4782 * The sort is @e stable, that is, for equivalent elements in the
4783 * two ranges, elements from the first range will always come
4784 * before elements from the second.
4785 */
4786 template<typename _InputIterator1, typename _InputIterator2,
4787 typename _OutputIterator>
4788 inline _OutputIterator
4789 merge(_InputIterator1 __first1, _InputIterator1 __last1,
4790 _InputIterator2 __first2, _InputIterator2 __last2,
4791 _OutputIterator __result)
4792 {
4793 // concept requirements
4794 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4795 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4796 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4797 typename iterator_traits<_InputIterator1>::value_type>)
4798 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4799 typename iterator_traits<_InputIterator2>::value_type>)
4800 __glibcxx_function_requires(_LessThanOpConcept<
4801 typename iterator_traits<_InputIterator2>::value_type,
4802 typename iterator_traits<_InputIterator1>::value_type>)
4803 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
4804 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
4805 __glibcxx_requires_irreflexive2(__first1, __last1);
4806 __glibcxx_requires_irreflexive2(__first2, __last2);
4807
4808 return _GLIBCXX_STD_A::__merge(__first1, __last1,
4809 __first2, __last2, __result,
4810 __gnu_cxx::__ops::__iter_less_iter());
4811 }
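
// ---- Editorial usage sketch (not part of this header) ---------------------
// A minimal program showing std::merge combining two sorted vectors into one
// sorted output range; the values are arbitrary illustrations.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

int main()
{
  std::vector<int> a{1, 3, 5};
  std::vector<int> b{2, 4, 6};
  std::vector<int> out;
  std::merge(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(out));
  assert(out == (std::vector<int>{1, 2, 3, 4, 5, 6}));
}
// ----------------------------------------------------------------------------
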
4812
4813 /**
4814 * @brief Merges two sorted ranges.
4815 * @ingroup sorting_algorithms
4816 * @param __first1 An iterator.
4817 * @param __first2 Another iterator.
4818 * @param __last1 Another iterator.
4819 * @param __last2 Another iterator.
4820 * @param __result An output iterator for the start of the merged range.
4821 * @param __comp A functor to use for comparisons.
4822 * @return An iterator pointing to the end of the merged output,
4823 * i.e. @p __result + (__last1-__first1) + (__last2-__first2).
4824 *
4825 * Merges the ranges @p [__first1,__last1) and @p [__first2,__last2) into
4826 * the sorted range @p [__result, __result + (__last1-__first1) +
4827 * (__last2-__first2)). Both input ranges must be sorted, and the
4828 * output range must not overlap with either of the input ranges.
4829 * The sort is @e stable, that is, for equivalent elements in the
4830 * two ranges, elements from the first range will always come
4831 * before elements from the second.
4832 *
4833 * The comparison function should have the same effects on ordering as
4834 * the function used for the initial sort.
4835 */
4836 template<typename _InputIterator1, typename _InputIterator2,
4837 typename _OutputIterator, typename _Compare>
4838 inline _OutputIterator
4839 merge(_InputIterator1 __first1, _InputIterator1 __last1,
4840 _InputIterator2 __first2, _InputIterator2 __last2,
4841 _OutputIterator __result, _Compare __comp)
4842 {
4843 // concept requirements
4844 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4845 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4846 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4847 typename iterator_traits<_InputIterator1>::value_type>)
4848 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4849 typename iterator_traits<_InputIterator2>::value_type>)
4850 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4851 typename iterator_traits<_InputIterator2>::value_type,
4852 typename iterator_traits<_InputIterator1>::value_type>)
4853 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
4854 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
4855 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
4856 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
4857
4858 return _GLIBCXX_STD_A::__merge(__first1, __last1,
4859 __first2, __last2, __result,
4860 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4861 }
4862
4863 template<typename _RandomAccessIterator, typename _Compare>
4864 inline void
4865 __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4866 _Compare __comp)
4867 {
4868 typedef typename iterator_traits<_RandomAccessIterator>::value_type
4869 _ValueType;
4870 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
4871 _DistanceType;
4872
4873 typedef _Temporary_buffer<_RandomAccessIterator, _ValueType> _TmpBuf;
4874 _TmpBuf __buf(__first, __last);
4875
4876 if (__buf.begin() == 0)
4877 std::__inplace_stable_sort(__first, __last, __comp);
4878 else
4879 std::__stable_sort_adaptive(__first, __last, __buf.begin(),
4880 _DistanceType(__buf.size()), __comp);
4881 }
4882
4883 /**
4884 * @brief Sort the elements of a sequence, preserving the relative order
4885 * of equivalent elements.
4886 * @ingroup sorting_algorithms
4887 * @param __first An iterator.
4888 * @param __last Another iterator.
4889 * @return Nothing.
4890 *
4891 * Sorts the elements in the range @p [__first,__last) in ascending order,
4892 * such that for each iterator @p i in the range @p [__first,__last-1),
4893 * @p *(i+1)<*i is false.
4894 *
4895 * The relative ordering of equivalent elements is preserved, so any two
4896 * elements @p x and @p y in the range @p [__first,__last) such that
4897 * @p x<y is false and @p y<x is false will have the same relative
4898 * ordering after calling @p stable_sort().
4899 */
4900 template<typename _RandomAccessIterator>
4901 inline void
4902 stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
4903 {
4904 // concept requirements
4905 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4906 _RandomAccessIterator>)
4907 __glibcxx_function_requires(_LessThanComparableConcept<
4908 typename iterator_traits<_RandomAccessIterator>::value_type>)
4909 __glibcxx_requires_valid_range(__first, __last);
4910 __glibcxx_requires_irreflexive(__first, __last);
4911
4912 _GLIBCXX_STD_A::__stable_sort(__first, __last,
4913 __gnu_cxx::__ops::__iter_less_iter());
4914 }
4915
4916 /**
4917 * @brief Sort the elements of a sequence using a predicate for comparison,
4918 * preserving the relative order of equivalent elements.
4919 * @ingroup sorting_algorithms
4920 * @param __first An iterator.
4921 * @param __last Another iterator.
4922 * @param __comp A comparison functor.
4923 * @return Nothing.
4924 *
4925 * Sorts the elements in the range @p [__first,__last) in ascending order,
4926 * such that for each iterator @p i in the range @p [__first,__last-1),
4927 * @p __comp(*(i+1),*i) is false.
4928 *
4929 * The relative ordering of equivalent elements is preserved, so any two
4930 * elements @p x and @p y in the range @p [__first,__last) such that
4931 * @p __comp(x,y) is false and @p __comp(y,x) is false will have the same
4932 * relative ordering after calling @p stable_sort().
4933 */
4934 template<typename _RandomAccessIterator, typename _Compare>
4935 inline void
4936 stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4937 _Compare __comp)
4938 {
4939 // concept requirements
4940 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4941 _RandomAccessIterator>)
4942 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4943 typename iterator_traits<_RandomAccessIterator>::value_type,
4944 typename iterator_traits<_RandomAccessIterator>::value_type>)
4945 __glibcxx_requires_valid_range(__first, __last);
4946 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4947
4948 _GLIBCXX_STD_A::__stable_sort(__first, __last,
4949 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4950 }
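// --- Editor's note: a small sketch of the stable_sort behaviour documented
// above (not part of <bits/stl_algo.h>); the struct and values are
// illustrative only.
#include <algorithm>
#include <string>
#include <vector>

struct Person { std::string name; int age; };

void stable_sort_example() {
  std::vector<Person> people{{"Ada", 30}, {"Bob", 25}, {"Cy", 30}};
  // Sort by age only; Ada and Cy compare equivalent (both 30), so their
  // original relative order is preserved: Bob, Ada, Cy.
  std::stable_sort(people.begin(), people.end(),
                   [](const Person &a, const Person &b) { return a.age < b.age; });
}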
4951
4952 template<typename _InputIterator1, typename _InputIterator2,
4953 typename _OutputIterator,
4954 typename _Compare>
4955 _OutputIterator
4956 __set_union(_InputIterator1 __first1, _InputIterator1 __last1,
4957 _InputIterator2 __first2, _InputIterator2 __last2,
4958 _OutputIterator __result, _Compare __comp)
4959 {
4960 while (__first1 != __last1 && __first2 != __last2)
4961 {
4962 if (__comp(__first1, __first2))
4963 {
4964 *__result = *__first1;
4965 ++__first1;
4966 }
4967 else if (__comp(__first2, __first1))
4968 {
4969 *__result = *__first2;
4970 ++__first2;
4971 }
4972 else
4973 {
4974 *__result = *__first1;
4975 ++__first1;
4976 ++__first2;
4977 }
4978 ++__result;
4979 }
4980 return std::copy(__first2, __last2,
4981 std::copy(__first1, __last1, __result));
4982 }
4983
4984 /**
4985 * @brief Return the union of two sorted ranges.
4986 * @ingroup set_algorithms
4987 * @param __first1 Start of first range.
4988 * @param __last1 End of first range.
4989 * @param __first2 Start of second range.
4990 * @param __last2 End of second range.
4991 * @return End of the output range.
4992 * @ingroup set_algorithms
4993 *
4994 * This operation iterates over both ranges, copying elements present in
4995 * each range in order to the output range. Iterators increment for each
4996 * range. When the current element of one range is less than the other,
4997 * that element is copied and the iterator advanced. If an element is
4998 * contained in both ranges, the element from the first range is copied and
4999 * both ranges advance. The output range may not overlap either input
5000 * range.
5001 */
5002 template<typename _InputIterator1, typename _InputIterator2,
5003 typename _OutputIterator>
5004 inline _OutputIterator
5005 set_union(_InputIterator1 __first1, _InputIterator1 __last1,
5006 _InputIterator2 __first2, _InputIterator2 __last2,
5007 _OutputIterator __result)
5008 {
5009 // concept requirements
5010 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5011 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5012 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5013 typename iterator_traits<_InputIterator1>::value_type>)
5014 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5015 typename iterator_traits<_InputIterator2>::value_type>)
5016 __glibcxx_function_requires(_LessThanOpConcept<
5017 typename iterator_traits<_InputIterator1>::value_type,
5018 typename iterator_traits<_InputIterator2>::value_type>)
5019 __glibcxx_function_requires(_LessThanOpConcept<
5020 typename iterator_traits<_InputIterator2>::value_type,
5021 typename iterator_traits<_InputIterator1>::value_type>)
5022 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5023 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5024 __glibcxx_requires_irreflexive2(__first1, __last1);
5025 __glibcxx_requires_irreflexive2(__first2, __last2);
5026
5027 return _GLIBCXX_STD_A::__set_union(__first1, __last1,
5028 __first2, __last2, __result,
5029 __gnu_cxx::__ops::__iter_less_iter());
5030 }
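// --- Editor's note: sketch of std::set_union as documented above (not part
// of <bits/stl_algo.h>); container contents are illustrative.
#include <algorithm>
#include <iterator>
#include <vector>

void set_union_example() {
  std::vector<int> a{1, 2, 4}, b{2, 3}, out;
  std::set_union(a.begin(), a.end(), b.begin(), b.end(),
                 std::back_inserter(out));
  // out == {1, 2, 3, 4}: the shared element 2 is copied from the first range
  // and both iterators advance past it.
}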
5031
5032 /**
5033 * @brief Return the union of two sorted ranges using a comparison functor.
5034 * @ingroup set_algorithms
5035 * @param __first1 Start of first range.
5036 * @param __last1 End of first range.
5037 * @param __first2 Start of second range.
5038 * @param __last2 End of second range.
5039 * @param __comp The comparison functor.
5040 * @return End of the output range.
5041 * @ingroup set_algorithms
5042 *
5043 * This operation iterates over both ranges, copying elements present in
5044 * each range in order to the output range. Iterators increment for each
5045 * range. When the current element of one range is less than the other
5046 * according to @p __comp, that element is copied and the iterator advanced.
5047 * If an equivalent element according to @p __comp is contained in both
5048 * ranges, the element from the first range is copied and both ranges
5049 * advance. The output range may not overlap either input range.
5050 */
5051 template<typename _InputIterator1, typename _InputIterator2,
5052 typename _OutputIterator, typename _Compare>
5053 inline _OutputIterator
5054 set_union(_InputIterator1 __first1, _InputIterator1 __last1,
5055 _InputIterator2 __first2, _InputIterator2 __last2,
5056 _OutputIterator __result, _Compare __comp)
5057 {
5058 // concept requirements
5059 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5060 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5061 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5062 typename iterator_traits<_InputIterator1>::value_type>)
5063 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5064 typename iterator_traits<_InputIterator2>::value_type>)
5065 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5066 typename iterator_traits<_InputIterator1>::value_type,
5067 typename iterator_traits<_InputIterator2>::value_type>)
5068 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5069 typename iterator_traits<_InputIterator2>::value_type,
5070 typename iterator_traits<_InputIterator1>::value_type>)
5071 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5072 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5073 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5074 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5075
5076 return _GLIBCXX_STD_A::__set_union(__first1, __last1,
5077 __first2, __last2, __result,
5078 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5079 }
5080
5081 template<typename _InputIterator1, typename _InputIterator2,
5082 typename _OutputIterator,
5083 typename _Compare>
5084 _OutputIterator
5085 __set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5086 _InputIterator2 __first2, _InputIterator2 __last2,
5087 _OutputIterator __result, _Compare __comp)
5088 {
5089 while (__first1 != __last1 && __first2 != __last2)
5090 if (__comp(__first1, __first2))
5091 ++__first1;
5092 else if (__comp(__first2, __first1))
5093 ++__first2;
5094 else
5095 {
5096 *__result = *__first1;
5097 ++__first1;
5098 ++__first2;
5099 ++__result;
5100 }
5101 return __result;
5102 }
5103
5104 /**
5105 * @brief Return the intersection of two sorted ranges.
5106 * @ingroup set_algorithms
5107 * @param __first1 Start of first range.
5108 * @param __last1 End of first range.
5109 * @param __first2 Start of second range.
5110 * @param __last2 End of second range.
5111 * @return End of the output range.
5112 * @ingroup set_algorithms
5113 *
5114 * This operation iterates over both ranges, copying elements present in
5115 * both ranges in order to the output range. Iterators increment for each
5116 * range. When the current element of one range is less than the other,
5117 * that iterator advances. If an element is contained in both ranges, the
5118 * element from the first range is copied and both ranges advance. The
5119 * output range may not overlap either input range.
5120 */
5121 template<typename _InputIterator1, typename _InputIterator2,
5122 typename _OutputIterator>
5123 inline _OutputIterator
5124 set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5125 _InputIterator2 __first2, _InputIterator2 __last2,
5126 _OutputIterator __result)
5127 {
5128 // concept requirements
5129 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5130 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5131 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5132 typename iterator_traits<_InputIterator1>::value_type>)
5133 __glibcxx_function_requires(_LessThanOpConcept<
5134 typename iterator_traits<_InputIterator1>::value_type,
5135 typename iterator_traits<_InputIterator2>::value_type>)
5136 __glibcxx_function_requires(_LessThanOpConcept<
5137 typename iterator_traits<_InputIterator2>::value_type,
5138 typename iterator_traits<_InputIterator1>::value_type>)
5139 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5140 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5141 __glibcxx_requires_irreflexive2(__first1, __last1);
5142 __glibcxx_requires_irreflexive2(__first2, __last2);
5143
5144 return _GLIBCXX_STD_A::__set_intersection(__first1, __last1,
5145 __first2, __last2, __result,
5146 __gnu_cxx::__ops::__iter_less_iter());
5147 }
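// --- Editor's note: sketch of std::set_intersection as documented above
// (not part of <bits/stl_algo.h>); container contents are illustrative.
#include <algorithm>
#include <iterator>
#include <vector>

void set_intersection_example() {
  std::vector<int> a{1, 2, 4, 5}, b{2, 3, 5}, out;
  std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                        std::back_inserter(out));
  // out == {2, 5}: only elements present in both sorted ranges are copied,
  // and they are taken from the first range.
}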
5148
5149 /**
5150 * @brief Return the intersection of two sorted ranges using comparison
5151 * functor.
5152 * @ingroup set_algorithms
5153 * @param __first1 Start of first range.
5154 * @param __last1 End of first range.
5155 * @param __first2 Start of second range.
5156 * @param __last2 End of second range.
5157 * @param __comp The comparison functor.
5158 * @return End of the output range.
5159 * @ingroup set_algorithms
5160 *
5161 * This operation iterates over both ranges, copying elements present in
5162 * both ranges in order to the output range. Iterators increment for each
5163 * range. When the current element of one range is less than the other
5164 * according to @p __comp, that iterator advances. If an element is
5165 * contained in both ranges according to @p __comp, the element from the
5166 * first range is copied and both ranges advance. The output range may not
5167 * overlap either input range.
5168 */
5169 template<typename _InputIterator1, typename _InputIterator2,
5170 typename _OutputIterator, typename _Compare>
5171 inline _OutputIterator
5172 set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5173 _InputIterator2 __first2, _InputIterator2 __last2,
5174 _OutputIterator __result, _Compare __comp)
5175 {
5176 // concept requirements
5177 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5178 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5179 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5180 typename iterator_traits<_InputIterator1>::value_type>)
5181 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5182 typename iterator_traits<_InputIterator1>::value_type,
5183 typename iterator_traits<_InputIterator2>::value_type>)
5184 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5185 typename iterator_traits<_InputIterator2>::value_type,
5186 typename iterator_traits<_InputIterator1>::value_type>)
5187 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5188 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5189 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5190 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5191
5192 return _GLIBCXX_STD_A::__set_intersection(__first1, __last1,
5193 __first2, __last2, __result,
5194 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5195 }
5196
5197 template<typename _InputIterator1, typename _InputIterator2,
5198 typename _OutputIterator,
5199 typename _Compare>
5200 _OutputIterator
5201 __set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5202 _InputIterator2 __first2, _InputIterator2 __last2,
5203 _OutputIterator __result, _Compare __comp)
5204 {
5205 while (__first1 != __last1 && __first2 != __last2)
5206 if (__comp(__first1, __first2))
5207 {
5208 *__result = *__first1;
5209 ++__first1;
5210 ++__result;
5211 }
5212 else if (__comp(__first2, __first1))
5213 ++__first2;
5214 else
5215 {
5216 ++__first1;
5217 ++__first2;
5218 }
5219 return std::copy(__first1, __last1, __result);
5220 }
5221
5222 /**
5223 * @brief Return the difference of two sorted ranges.
5224 * @ingroup set_algorithms
5225 * @param __first1 Start of first range.
5226 * @param __last1 End of first range.
5227 * @param __first2 Start of second range.
5228 * @param __last2 End of second range.
5229 * @return End of the output range.
5230 * @ingroup set_algorithms
5231 *
5232 * This operation iterates over both ranges, copying elements present in
5233 * the first range but not the second in order to the output range.
5234 * Iterators increment for each range. When the current element of the
5235 * first range is less than the second, that element is copied and the
5236 * iterator advances. If the current element of the second range is less,
5237 * the iterator advances, but no element is copied. If an element is
5238 * contained in both ranges, no elements are copied and both ranges
5239 * advance. The output range may not overlap either input range.
5240 */
5241 template<typename _InputIterator1, typename _InputIterator2,
5242 typename _OutputIterator>
5243 inline _OutputIterator
5244 set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5245 _InputIterator2 __first2, _InputIterator2 __last2,
5246 _OutputIterator __result)
5247 {
5248 // concept requirements
5249 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5250 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5251 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5252 typename iterator_traits<_InputIterator1>::value_type>)
5253 __glibcxx_function_requires(_LessThanOpConcept<
5254 typename iterator_traits<_InputIterator1>::value_type,
5255 typename iterator_traits<_InputIterator2>::value_type>)
5256 __glibcxx_function_requires(_LessThanOpConcept<
5257 typename iterator_traits<_InputIterator2>::value_type,
5258 typename iterator_traits<_InputIterator1>::value_type>)
5259 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5260 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5261 __glibcxx_requires_irreflexive2(__first1, __last1);
5262 __glibcxx_requires_irreflexive2(__first2, __last2);
5263
5264 return _GLIBCXX_STD_A::__set_difference(__first1, __last1,
5265 __first2, __last2, __result,
5266 __gnu_cxx::__ops::__iter_less_iter());
5267 }
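// --- Editor's note: sketch of std::set_difference as documented above
// (not part of <bits/stl_algo.h>); container contents are illustrative.
#include <algorithm>
#include <iterator>
#include <vector>

void set_difference_example() {
  std::vector<int> a{1, 2, 4, 5}, b{2, 3, 5}, out;
  std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
                      std::back_inserter(out));
  // out == {1, 4}: elements of the first range that do not appear in the
  // second; shared elements are skipped in both ranges.
}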
5268
5269 /**
5270 * @brief Return the difference of two sorted ranges using comparison
5271 * functor.
5272 * @ingroup set_algorithms
5273 * @param __first1 Start of first range.
5274 * @param __last1 End of first range.
5275 * @param __first2 Start of second range.
5276 * @param __last2 End of second range.
5277 * @param __comp The comparison functor.
5278 * @return End of the output range.
5279 * @ingroup set_algorithms
5280 *
5281 * This operation iterates over both ranges, copying elements present in
5282 * the first range but not the second in order to the output range.
5283 * Iterators increment for each range. When the current element of the
5284 * first range is less than the second according to @p __comp, that element
5285 * is copied and the iterator advances. If the current element of the
5286 * second range is less, no element is copied and the iterator advances.
5287 * If an element is contained in both ranges according to @p __comp, no
5288 * elements are copied and both ranges advance. The output range may not
5289 * overlap either input range.
5290 */
5291 template<typename _InputIterator1, typename _InputIterator2,
5292 typename _OutputIterator, typename _Compare>
5293 inline _OutputIterator
5294 set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5295 _InputIterator2 __first2, _InputIterator2 __last2,
5296 _OutputIterator __result, _Compare __comp)
5297 {
5298 // concept requirements
5299 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5300 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5301 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5302 typename iterator_traits<_InputIterator1>::value_type>)
5303 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5304 typename iterator_traits<_InputIterator1>::value_type,
5305 typename iterator_traits<_InputIterator2>::value_type>)
5306 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5307 typename iterator_traits<_InputIterator2>::value_type,
5308 typename iterator_traits<_InputIterator1>::value_type>)
5309 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5310 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5311 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5312 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5313
5314 return _GLIBCXX_STD_A::__set_difference(__first1, __last1,
5315 __first2, __last2, __result,
5316 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5317 }
5318
5319 template<typename _InputIterator1, typename _InputIterator2,
5320 typename _OutputIterator,
5321 typename _Compare>
5322 _OutputIterator
5323 __set_symmetric_difference(_InputIterator1 __first1,
5324 _InputIterator1 __last1,
5325 _InputIterator2 __first2,
5326 _InputIterator2 __last2,
5327 _OutputIterator __result,
5328 _Compare __comp)
5329 {
5330 while (__first1 != __last1 && __first2 != __last2)
5331 if (__comp(__first1, __first2))
5332 {
5333 *__result = *__first1;
5334 ++__first1;
5335 ++__result;
5336 }
5337 else if (__comp(__first2, __first1))
5338 {
5339 *__result = *__first2;
5340 ++__first2;
5341 ++__result;
5342 }
5343 else
5344 {
5345 ++__first1;
5346 ++__first2;
5347 }
5348 return std::copy(__first2, __last2,
5349 std::copy(__first1, __last1, __result));
5350 }
5351
5352 /**
5353 * @brief Return the symmetric difference of two sorted ranges.
5354 * @ingroup set_algorithms
5355 * @param __first1 Start of first range.
5356 * @param __last1 End of first range.
5357 * @param __first2 Start of second range.
5358 * @param __last2 End of second range.
5359 * @return End of the output range.
5360 * @ingroup set_algorithms
5361 *
5362 * This operation iterates over both ranges, copying elements present in
5363 * one range but not the other in order to the output range. Iterators
5364 * increment for each range. When the current element of one range is less
5365 * than the other, that element is copied and the iterator advances. If an
5366 * element is contained in both ranges, no elements are copied and both
5367 * ranges advance. The output range may not overlap either input range.
5368 */
5369 template<typename _InputIterator1, typename _InputIterator2,
5370 typename _OutputIterator>
5371 inline _OutputIterator
5372 set_symmetric_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5373 _InputIterator2 __first2, _InputIterator2 __last2,
5374 _OutputIterator __result)
5375 {
5376 // concept requirements
5377 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5378 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5379 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5380 typename iterator_traits<_InputIterator1>::value_type>)
5381 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5382 typename iterator_traits<_InputIterator2>::value_type>)
5383 __glibcxx_function_requires(_LessThanOpConcept<
5384 typename iterator_traits<_InputIterator1>::value_type,
5385 typename iterator_traits<_InputIterator2>::value_type>)
5386 __glibcxx_function_requires(_LessThanOpConcept<
5387 typename iterator_traits<_InputIterator2>::value_type,
5388 typename iterator_traits<_InputIterator1>::value_type>)
5389 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5390 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5391 __glibcxx_requires_irreflexive2(__first1, __last1);
5392 __glibcxx_requires_irreflexive2(__first2, __last2);
5393
5394 return _GLIBCXX_STD_A::__set_symmetric_difference(__first1, __last1,
5395 __first2, __last2, __result,
5396 __gnu_cxx::__ops::__iter_less_iter());
5397 }
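// --- Editor's note: sketch of std::set_symmetric_difference as documented
// above (not part of <bits/stl_algo.h>); container contents are illustrative.
#include <algorithm>
#include <iterator>
#include <vector>

void set_symmetric_difference_example() {
  std::vector<int> a{1, 2, 4}, b{2, 3}, out;
  std::set_symmetric_difference(a.begin(), a.end(), b.begin(), b.end(),
                                std::back_inserter(out));
  // out == {1, 3, 4}: elements present in exactly one of the two ranges;
  // the shared element 2 is dropped from both.
}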
5398
5399 /**
5400 * @brief Return the symmetric difference of two sorted ranges using
5401 * comparison functor.
5402 * @ingroup set_algorithms
5403 * @param __first1 Start of first range.
5404 * @param __last1 End of first range.
5405 * @param __first2 Start of second range.
5406 * @param __last2 End of second range.
5407 * @param __comp The comparison functor.
5408 * @return End of the output range.
5409 * @ingroup set_algorithms
5410 *
5411 * This operation iterates over both ranges, copying elements present in
5412 * one range but not the other in order to the output range. Iterators
5413 * increment for each range. When the current element of one range is less
5414 * than the other according to @p __comp, that element is copied and the
5415 * iterator advances. If an element is contained in both ranges according
5416 * to @p __comp, no elements are copied and both ranges advance. The output
5417 * range may not overlap either input range.
5418 */
5419 template<typename _InputIterator1, typename _InputIterator2,
5420 typename _OutputIterator, typename _Compare>
5421 inline _OutputIterator
5422 set_symmetric_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5423 _InputIterator2 __first2, _InputIterator2 __last2,
5424 _OutputIterator __result,
5425 _Compare __comp)
5426 {
5427 // concept requirements
5428 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5429 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5430 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5431 typename iterator_traits<_InputIterator1>::value_type>)
5432 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5433 typename iterator_traits<_InputIterator2>::value_type>)
5434 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5435 typename iterator_traits<_InputIterator1>::value_type,
5436 typename iterator_traits<_InputIterator2>::value_type>)
5437 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5438 typename iterator_traits<_InputIterator2>::value_type,
5439 typename iterator_traits<_InputIterator1>::value_type>)
5440 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5441 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5442 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5443 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5444
5445 return _GLIBCXX_STD_A::__set_symmetric_difference(__first1, __last1,
5446 __first2, __last2, __result,
5447 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5448 }
5449
5450 template<typename _ForwardIterator, typename _Compare>
5451 _GLIBCXX14_CONSTEXPR
5452 _ForwardIterator
5453 __min_element(_ForwardIterator __first, _ForwardIterator __last,
5454 _Compare __comp)
5455 {
5456 if (__first == __last)
5457 return __first;
5458 _ForwardIterator __result = __first;
5459 while (++__first != __last)
5460 if (__comp(__first, __result))
5461 __result = __first;
5462 return __result;
5463 }
5464
5465 /**
5466 * @brief Return the minimum element in a range.
5467 * @ingroup sorting_algorithms
5468 * @param __first Start of range.
5469 * @param __last End of range.
5470 * @return Iterator referencing the first instance of the smallest value.
5471 */
5472 template<typename _ForwardIterator>
5473 _GLIBCXX14_CONSTEXPR
5474 _ForwardIterator
5475 inline min_element(_ForwardIterator __first, _ForwardIterator __last)
5476 {
5477 // concept requirements
5478 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5479 __glibcxx_function_requires(_LessThanComparableConcept<
5480 typename iterator_traits<_ForwardIterator>::value_type>)
5481 __glibcxx_requires_valid_range(__first, __last);
5482 __glibcxx_requires_irreflexive(__first, __last);
5483
5484 return _GLIBCXX_STD_A::__min_element(__first, __last,
5485 __gnu_cxx::__ops::__iter_less_iter());
5486 }
5487
5488 /**
5489 * @brief Return the minimum element in a range using comparison functor.
5490 * @ingroup sorting_algorithms
5491 * @param __first Start of range.
5492 * @param __last End of range.
5493 * @param __comp Comparison functor.
5494 * @return Iterator referencing the first instance of the smallest value
5495 * according to __comp.
5496 */
5497 template<typename _ForwardIterator, typename _Compare>
5498 _GLIBCXX14_CONSTEXPR
5499 inline _ForwardIterator
5500 min_element(_ForwardIterator __first, _ForwardIterator __last,
5501 _Compare __comp)
5502 {
5503 // concept requirements
5504 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5505 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5506 typename iterator_traits<_ForwardIterator>::value_type,
5507 typename iterator_traits<_ForwardIterator>::value_type>)
5508 __glibcxx_requires_valid_range(__first, __last);
5509 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
5510
5511 return _GLIBCXX_STD_A::__min_element(__first, __last,
5512 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5513 }
5514
5515 template<typename _ForwardIterator, typename _Compare>
5516 _GLIBCXX14_CONSTEXPR
5517 _ForwardIterator
5518 __max_element(_ForwardIterator __first, _ForwardIterator __last,
5519 _Compare __comp)
5520 {
5521 if (__first == __last) return __first;
5522 _ForwardIterator __result = __first;
5523 while (++__first != __last)
5524 if (__comp(__result, __first))
5525 __result = __first;
5526 return __result;
5527 }
5528
5529 /**
5530 * @brief Return the maximum element in a range.
5531 * @ingroup sorting_algorithms
5532 * @param __first Start of range.
5533 * @param __last End of range.
5534 * @return Iterator referencing the first instance of the largest value.
5535 */
5536 template<typename _ForwardIterator>
5537 _GLIBCXX14_CONSTEXPR
5538 inline _ForwardIterator
5539 max_element(_ForwardIterator __first, _ForwardIterator __last)
5540 {
5541 // concept requirements
5542 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5543 __glibcxx_function_requires(_LessThanComparableConcept<
5544 typename iterator_traits<_ForwardIterator>::value_type>)
5545 __glibcxx_requires_valid_range(__first, __last);
5546 __glibcxx_requires_irreflexive(__first, __last);
5547
5548 return _GLIBCXX_STD_A::__max_element(__first, __last,
5549 __gnu_cxx::__ops::__iter_less_iter());
5550 }
5551
5552 /**
5553 * @brief Return the maximum element in a range using comparison functor.
5554 * @ingroup sorting_algorithms
5555 * @param __first Start of range.
5556 * @param __last End of range.
5557 * @param __comp Comparison functor.
5558 * @return Iterator referencing the first instance of the largest value
5559 * according to __comp.
5560 */
5561 template<typename _ForwardIterator, typename _Compare>
5562 _GLIBCXX14_CONSTEXPR
5563 inline _ForwardIterator
5564 max_element(_ForwardIterator __first, _ForwardIterator __last,
5565 _Compare __comp)
5566 {
5567 // concept requirements
5568 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5569 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5570 typename iterator_traits<_ForwardIterator>::value_type,
5571 typename iterator_traits<_ForwardIterator>::value_type>)
5572 __glibcxx_requires_valid_range(__first, __last);
5573 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
5574
5575 return _GLIBCXX_STD_A::__max_element(__first, __last,
5576 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5577 }
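// --- Editor's note: sketch of std::min_element / std::max_element as
// documented above (not part of <bits/stl_algo.h>). Both return an iterator
// to the *first* instance of the extreme value, or `last` for an empty range.
#include <algorithm>
#include <cstdlib>
#include <vector>

void min_max_element_example() {
  std::vector<int> v{3, 1, 4, 1, 5};
  auto mn = std::min_element(v.begin(), v.end()); // points at the first 1
  auto mx = std::max_element(v.begin(), v.end()); // points at 5
  auto mx_abs = std::max_element(v.begin(), v.end(),
                                 [](int a, int b) { return std::abs(a) < std::abs(b); });
  (void)mn; (void)mx; (void)mx_abs;
}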
5578
5579_GLIBCXX_END_NAMESPACE_ALGO
5580} // namespace std
5581
5582#endif /* _STL_ALGO_H */

/build/llvm-toolchain-snapshot-12~++20201211111113+08280c4b734/llvm/include/llvm/ADT/SmallVector.h

1//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SmallVector class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_ADT_SMALLVECTOR_H
14#define LLVM_ADT_SMALLVECTOR_H
15
16#include "llvm/ADT/iterator_range.h"
17#include "llvm/Support/Compiler.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/MemAlloc.h"
21#include "llvm/Support/type_traits.h"
22#include <algorithm>
23#include <cassert>
24#include <cstddef>
25#include <cstdlib>
26#include <cstring>
27#include <initializer_list>
28#include <iterator>
29#include <limits>
30#include <memory>
31#include <new>
32#include <type_traits>
33#include <utility>
34
35namespace llvm {
36
37/// This is all the stuff common to all SmallVectors.
38///
39/// The template parameter specifies the type which should be used to hold the
40/// Size and Capacity of the SmallVector, so it can be adjusted.
41/// Using 32 bit size is desirable to shrink the size of the SmallVector.
42/// Using 64 bit size is desirable for cases like SmallVector<char>, where a
43/// 32 bit size would limit the vector to ~4GB. SmallVectors are used for
44/// buffering bitcode output - which can exceed 4GB.
45template <class Size_T> class SmallVectorBase {
46protected:
47 void *BeginX;
48 Size_T Size = 0, Capacity;
49
50 /// The maximum value of the Size_T used.
51 static constexpr size_t SizeTypeMax() {
52 return std::numeric_limits<Size_T>::max();
53 }
54
55 SmallVectorBase() = delete;
56 SmallVectorBase(void *FirstEl, size_t TotalCapacity)
57 : BeginX(FirstEl), Capacity(TotalCapacity) {}
58
59 /// This is an implementation of the grow() method which only works
60 /// on POD-like data types and is out of line to reduce code duplication.
61 /// This function will report a fatal error if it cannot increase capacity.
62 void grow_pod(void *FirstEl, size_t MinSize, size_t TSize);
63
64 /// Report that MinSize doesn't fit into this vector's size type. Throws
65 /// std::length_error or calls report_fatal_error.
66 LLVM_ATTRIBUTE_NORETURN static void report_size_overflow(size_t MinSize);
67 /// Report that this vector is already at maximum capacity. Throws
68 /// std::length_error or calls report_fatal_error.
69 LLVM_ATTRIBUTE_NORETURN static void report_at_maximum_capacity();
70
71public:
72 size_t size() const { return Size; }
67. Returning zero
73 size_t capacity() const { return Capacity; }
74
75 LLVM_NODISCARD bool empty() const { return !Size; }
76
77 /// Set the array size to \p N, which the current array must have enough
78 /// capacity for.
79 ///
80 /// This does not construct or destroy any elements in the vector.
81 ///
82 /// Clients can use this in conjunction with capacity() to write past the end
83 /// of the buffer when they know that more elements are available, and only
84 /// update the size later. This avoids the cost of value initializing elements
85 /// which will only be overwritten.
86 void set_size(size_t N) {
87 assert(N <= capacity());
88 Size = N;
89 }
90};
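// --- Editor's note: a hedged sketch of the set_size()/capacity() pattern
// described in the set_size comment above (not part of SmallVector.h). It
// assumes llvm/ADT/SmallVector.h is available; the buffer sizes and the
// snprintf call are illustrative only.
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

void set_size_example(int X) {
  llvm::SmallVector<char, 32> Buf;
  Buf.reserve(64);                    // guarantee capacity up front
  int N = std::snprintf(Buf.data(), Buf.capacity(), "value = %d", X);
  if (N > 0 && static_cast<size_t>(N) < Buf.capacity())
    Buf.set_size(N);                  // publish the bytes already written,
                                      // without value-initializing them first
}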
91
92template <class T>
93using SmallVectorSizeType =
94 typename std::conditional<sizeof(T) < 4 && sizeof(void *) >= 8, uint64_t,
95 uint32_t>::type;
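// --- Editor's note: a standalone sketch mirroring the SmallVectorSizeType
// selection above (not part of SmallVector.h). On a 64-bit target, element
// types smaller than 4 bytes get a 64-bit Size/Capacity field (so e.g.
// SmallVector<char> can exceed ~4GB); everything else keeps 32-bit fields.
#include <cstdint>
#include <type_traits>

template <class T>
using SizeTypeSketch =
    typename std::conditional<sizeof(T) < 4 && sizeof(void *) >= 8,
                              std::uint64_t, std::uint32_t>::type;

static_assert(std::is_same<SizeTypeSketch<int>, std::uint32_t>::value,
              "4-byte elements use a 32-bit size field");
// On common 64-bit targets, SizeTypeSketch<char> is std::uint64_t.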
96
97/// Figure out the offset of the first element.
98template <class T, typename = void> struct SmallVectorAlignmentAndSize {
99 alignas(SmallVectorBase<SmallVectorSizeType<T>>) char Base[sizeof(
100 SmallVectorBase<SmallVectorSizeType<T>>)];
101 alignas(T) char FirstEl[sizeof(T)];
102};
103
104/// This is the part of SmallVectorTemplateBase which does not depend on whether
105/// the type T is a POD. The extra dummy template argument is used by ArrayRef
106/// to avoid unnecessarily requiring T to be complete.
107template <typename T, typename = void>
108class SmallVectorTemplateCommon
109 : public SmallVectorBase<SmallVectorSizeType<T>> {
110 using Base = SmallVectorBase<SmallVectorSizeType<T>>;
111
112 /// Find the address of the first element. For this pointer math to be valid
113 /// with small-size of 0 for T with lots of alignment, it's important that
114 /// SmallVectorStorage is properly-aligned even for small-size of 0.
115 void *getFirstEl() const {
116 return const_cast<void *>(reinterpret_cast<const void *>(
117 reinterpret_cast<const char *>(this) +
118 offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
119 }
120 // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
121
122protected:
123 SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {}
124
125 void grow_pod(size_t MinSize, size_t TSize) {
126 Base::grow_pod(getFirstEl(), MinSize, TSize);
127 }
128
129 /// Return true if this is a smallvector which has not had dynamic
130 /// memory allocated for it.
131 bool isSmall() const { return this->BeginX == getFirstEl(); }
132
133 /// Put this vector in a state of being small.
134 void resetToSmall() {
135 this->BeginX = getFirstEl();
136 this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
137 }
138
139 /// Return true unless Elt will be invalidated by resizing the vector to
140 /// NewSize.
141 bool isSafeToReferenceAfterResize(const void *Elt, size_t NewSize) {
142 // Past the end.
143 if (LLVM_LIKELY(Elt < this->begin() || Elt >= this->end()))
144 return true;
145
146 // Return false if Elt will be destroyed by shrinking.
147 if (NewSize <= this->size())
148 return Elt < this->begin() + NewSize;
149
150 // Return false if we need to grow.
151 return NewSize <= this->capacity();
152 }
153
154 /// Check whether Elt will be invalidated by resizing the vector to NewSize.
155 void assertSafeToReferenceAfterResize(const void *Elt, size_t NewSize) {
156 assert(isSafeToReferenceAfterResize(Elt, NewSize) &&
157 "Attempting to reference an element of the vector in an operation "
158 "that invalidates it");
159 }
160
161 /// Check whether Elt will be invalidated by increasing the size of the
162 /// vector by N.
163 void assertSafeToAdd(const void *Elt, size_t N = 1) {
164 this->assertSafeToReferenceAfterResize(Elt, this->size() + N);
165 }
166
167 /// Check whether any part of the range will be invalidated by clearing.
168 void assertSafeToReferenceAfterClear(const T *From, const T *To) {
169 if (From == To)
170 return;
171 this->assertSafeToReferenceAfterResize(From, 0);
172 this->assertSafeToReferenceAfterResize(To - 1, 0);
173 }
174 template <
175 class ItTy,
176 std::enable_if_t<!std::is_same<std::remove_const_t<ItTy>, T *>::value,
177 bool> = false>
178 void assertSafeToReferenceAfterClear(ItTy, ItTy) {}
179
180 /// Check whether any part of the range will be invalidated by growing.
181 void assertSafeToAddRange(const T *From, const T *To) {
182 if (From == To)
183 return;
184 this->assertSafeToAdd(From, To - From);
185 this->assertSafeToAdd(To - 1, To - From);
186 }
187 template <
188 class ItTy,
189 std::enable_if_t<!std::is_same<std::remove_const_t<ItTy>, T *>::value,
190 bool> = false>
191 void assertSafeToAddRange(ItTy, ItTy) {}
192
193 /// Check whether any argument will be invalidated by growing for
194 /// emplace_back.
195 template <class ArgType1, class... ArgTypes>
196 void assertSafeToEmplace(ArgType1 &Arg1, ArgTypes &... Args) {
197 this->assertSafeToAdd(&Arg1);
198 this->assertSafeToEmplace(Args...);
199 }
200 void assertSafeToEmplace() {}
201
202public:
203 using size_type = size_t;
204 using difference_type = ptrdiff_t;
205 using value_type = T;
206 using iterator = T *;
207 using const_iterator = const T *;
208
209 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
210 using reverse_iterator = std::reverse_iterator<iterator>;
211
212 using reference = T &;
213 using const_reference = const T &;
214 using pointer = T *;
215 using const_pointer = const T *;
216
217 using Base::capacity;
218 using Base::empty;
219 using Base::size;
220
221 // forward iterator creation methods.
222 iterator begin() { return (iterator)this->BeginX; }
223 const_iterator begin() const { return (const_iterator)this->BeginX; }
224 iterator end() { return begin() + size(); }
225 const_iterator end() const { return begin() + size(); }
226
227 // reverse iterator creation methods.
228 reverse_iterator rbegin() { return reverse_iterator(end()); }
229 const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
230 reverse_iterator rend() { return reverse_iterator(begin()); }
231 const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
232
233 size_type size_in_bytes() const { return size() * sizeof(T); }
234 size_type max_size() const {
235 return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T));
236 }
237
238 size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
239
240 /// Return a pointer to the vector's buffer, even if empty().
241 pointer data() { return pointer(begin()); }
242 /// Return a pointer to the vector's buffer, even if empty().
243 const_pointer data() const { return const_pointer(begin()); }
244
245 reference operator[](size_type idx) {
246 assert(idx < size());
247 return begin()[idx];
248 }
249 const_reference operator[](size_type idx) const {
250 assert(idx < size());
251 return begin()[idx];
252 }
253
254 reference front() {
255 assert(!empty());
256 return begin()[0];
257 }
258 const_reference front() const {
259 assert(!empty());
260 return begin()[0];
261 }
262
263 reference back() {
264 assert(!empty());
265 return end()[-1];
266 }
267 const_reference back() const {
268 assert(!empty());
269 return end()[-1];
270 }
271};
272
273/// SmallVectorTemplateBase<TriviallyCopyable = false> - This is where we put
274/// method implementations that are designed to work with non-trivial T's.
275///
276/// We approximate is_trivially_copyable with trivial move/copy construction and
277 trivial destruction. While the standard doesn't specify that you're allowed to
278/// copy these types with memcpy, there is no way for the type to observe this.
279/// This catches the important case of std::pair<POD, POD>, which is not
280/// trivially assignable.
281template <typename T, bool = (is_trivially_copy_constructible<T>::value) &&
282 (is_trivially_move_constructible<T>::value) &&
283 std::is_trivially_destructible<T>::value>
284class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
285protected:
286 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
287
288 static void destroy_range(T *S, T *E) {
289 while (S != E) {
290 --E;
291 E->~T();
292 }
293 }
294
295 /// Move the range [I, E) into the uninitialized memory starting with "Dest",
296 /// constructing elements as needed.
297 template<typename It1, typename It2>
298 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
299 std::uninitialized_copy(std::make_move_iterator(I),
300 std::make_move_iterator(E), Dest);
301 }
302
303 /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
304 /// constructing elements as needed.
305 template<typename It1, typename It2>
306 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
307 std::uninitialized_copy(I, E, Dest);
308 }
309
310 /// Grow the allocated memory (without initializing new elements), doubling
311 /// the size of the allocated memory. Guarantees space for at least one more
312 /// element, or MinSize more elements if specified.
313 void grow(size_t MinSize = 0);
314
315public:
316 void push_back(const T &Elt) {
317 this->assertSafeToAdd(&Elt);
318 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
319 this->grow();
320 ::new ((void*) this->end()) T(Elt);
321 this->set_size(this->size() + 1);
322 }
323
324 void push_back(T &&Elt) {
325 this->assertSafeToAdd(&Elt);
326 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
327 this->grow();
328 ::new ((void*) this->end()) T(::std::move(Elt));
329 this->set_size(this->size() + 1);
330 }
331
332 void pop_back() {
333 this->set_size(this->size() - 1);
334 this->end()->~T();
335 }
336};
337
338// Define this out-of-line to dissuade the C++ compiler from inlining it.
339template <typename T, bool TriviallyCopyable>
340void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) {
341 // Ensure we can fit the new capacity.
342 // This is only going to be applicable when the capacity is 32 bit.
343 if (MinSize > this->SizeTypeMax())
344 this->report_size_overflow(MinSize);
345
346 // Ensure we can meet the guarantee of space for at least one more element.
347 // The above check alone will not catch the case where grow is called with a
348 // default MinSize of 0, but the current capacity cannot be increased.
349 // This is only going to be applicable when the capacity is 32 bit.
350 if (this->capacity() == this->SizeTypeMax())
351 this->report_at_maximum_capacity();
352
353 // Always grow, even from zero.
354 size_t NewCapacity = size_t(NextPowerOf2(this->capacity() + 2));
355 NewCapacity = std::min(std::max(NewCapacity, MinSize), this->SizeTypeMax());
356 T *NewElts = static_cast<T*>(llvm::safe_malloc(NewCapacity*sizeof(T)));
357
358 // Move the elements over.
359 this->uninitialized_move(this->begin(), this->end(), NewElts);
360
361 // Destroy the original elements.
362 destroy_range(this->begin(), this->end());
363
364 // If this wasn't grown from the inline copy, deallocate the old space.
365 if (!this->isSmall())
366 free(this->begin());
367
368 this->BeginX = NewElts;
369 this->Capacity = NewCapacity;
370}
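// --- Editor's note: a standalone sketch of the growth policy implemented in
// grow() above (not part of SmallVector.h). NextPowerOf2Sketch is a local
// stand-in for llvm::NextPowerOf2, and SizeTypeMax is passed in explicitly
// for illustration.
#include <algorithm>
#include <cstddef>
#include <cstdint>

static std::uint64_t NextPowerOf2Sketch(std::uint64_t A) {
  // Smallest power of two strictly greater than A.
  std::uint64_t P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

static std::size_t NewCapacitySketch(std::size_t OldCapacity, std::size_t MinSize,
                                     std::size_t SizeTypeMax) {
  // Always grow, even from zero: 0 -> 4, 4 -> 8, 8 -> 16, ...
  std::size_t NewCapacity = std::size_t(NextPowerOf2Sketch(OldCapacity + 2));
  // Respect the caller's minimum and never exceed what the size type can hold.
  return std::min(std::max(NewCapacity, MinSize), SizeTypeMax);
}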
371
372/// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
373/// method implementations that are designed to work with trivially copyable
374/// T's. This allows using memcpy in place of copy/move construction and
375/// skipping destruction.
376template <typename T>
377class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
378protected:
379 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
380
381 // No need to do a destroy loop for POD's.
382 static void destroy_range(T *, T *) {}
383
384 /// Move the range [I, E) onto the uninitialized memory
385 /// starting with "Dest", constructing elements into it as needed.
386 template<typename It1, typename It2>
387 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
388 // Just do a copy.
389 uninitialized_copy(I, E, Dest);
390 }
391
392 /// Copy the range [I, E) onto the uninitialized memory
393 /// starting with "Dest", constructing elements into it as needed.
394 template<typename It1, typename It2>
395 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
396 // Arbitrary iterator types; just use the basic implementation.
397 std::uninitialized_copy(I, E, Dest);
398 }
399
400 /// Copy the range [I, E) onto the uninitialized memory
401 /// starting with "Dest", constructing elements into it as needed.
402 template <typename T1, typename T2>
403 static void uninitialized_copy(
404 T1 *I, T1 *E, T2 *Dest,
405 std::enable_if_t<std::is_same<typename std::remove_const<T1>::type,
406 T2>::value> * = nullptr) {
407 // Use memcpy for PODs iterated by pointers (which includes SmallVector
408 // iterators): std::uninitialized_copy optimizes to memmove, but we can
409 // use memcpy here. Note that I and E are iterators and thus might be
410 // invalid for memcpy if they are equal.
411 if (I != E)
412 memcpy(reinterpret_cast<void *>(Dest), I, (E - I) * sizeof(T));
413 }
414
415 /// Double the size of the allocated memory, guaranteeing space for at
416 /// least one more element or MinSize if specified.
417 void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); }
418
419public:
420 void push_back(const T &Elt) {
421 this->assertSafeToAdd(&Elt);
422 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
423 this->grow();
424 memcpy(reinterpret_cast<void *>(this->end()), &Elt, sizeof(T));
425 this->set_size(this->size() + 1);
426 }
427
428 void pop_back() { this->set_size(this->size() - 1); }
429};
430
431/// This class consists of common code factored out of the SmallVector class to
432/// reduce code duplication based on the SmallVector 'N' template parameter.
433template <typename T>
434class SmallVectorImpl : public SmallVectorTemplateBase<T> {
435 using SuperClass = SmallVectorTemplateBase<T>;
436
437public:
438 using iterator = typename SuperClass::iterator;
439 using const_iterator = typename SuperClass::const_iterator;
440 using reference = typename SuperClass::reference;
441 using size_type = typename SuperClass::size_type;
442
443protected:
444 // Default ctor - Initialize to empty.
445 explicit SmallVectorImpl(unsigned N)
446 : SmallVectorTemplateBase<T>(N) {}
447
448public:
449 SmallVectorImpl(const SmallVectorImpl &) = delete;
450
451 ~SmallVectorImpl() {
452 // Subclass has already destructed this vector's elements.
453 // If this wasn't grown from the inline copy, deallocate the old space.
454 if (!this->isSmall())
455 free(this->begin());
456 }
457
458 void clear() {
459 this->destroy_range(this->begin(), this->end());
460 this->Size = 0;
461 }
462
463 void resize(size_type N) {
464 if (N < this->size()) {
465 this->destroy_range(this->begin()+N, this->end());
466 this->set_size(N);
467 } else if (N > this->size()) {
468 if (this->capacity() < N)
469 this->grow(N);
470 for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
471 new (&*I) T();
472 this->set_size(N);
473 }
474 }
475
476 void resize(size_type N, const T &NV) {
477 if (N == this->size())
478 return;
479
480 if (N < this->size()) {
481 this->destroy_range(this->begin()+N, this->end());
482 this->set_size(N);
483 return;
484 }
485
486 this->assertSafeToReferenceAfterResize(&NV, N);
487 if (this->capacity() < N)
488 this->grow(N);
489 std::uninitialized_fill(this->end(), this->begin() + N, NV);
490 this->set_size(N);
491 }
492
493 void reserve(size_type N) {
494 if (this->capacity() < N)
495 this->grow(N);
496 }
497
498 void pop_back_n(size_type NumItems) {
499 assert(this->size() >= NumItems);
500 this->destroy_range(this->end() - NumItems, this->end());
501 this->set_size(this->size() - NumItems);
502 }
503
504 LLVM_NODISCARD T pop_back_val() {
505 T Result = ::std::move(this->back());
506 this->pop_back();
507 return Result;
508 }
509
510 void swap(SmallVectorImpl &RHS);
511
512 /// Add the specified range to the end of the SmallVector.
513 template <typename in_iter,
514 typename = std::enable_if_t<std::is_convertible<
515 typename std::iterator_traits<in_iter>::iterator_category,
516 std::input_iterator_tag>::value>>
517 void append(in_iter in_start, in_iter in_end) {
518 this->assertSafeToAddRange(in_start, in_end);
519 size_type NumInputs = std::distance(in_start, in_end);
520 if (NumInputs > this->capacity() - this->size())
521 this->grow(this->size()+NumInputs);
522
523 this->uninitialized_copy(in_start, in_end, this->end());
524 this->set_size(this->size() + NumInputs);
525 }
526
527 /// Append \p NumInputs copies of \p Elt to the end.
528 void append(size_type NumInputs, const T &Elt) {
529 this->assertSafeToAdd(&Elt, NumInputs);
530 if (NumInputs > this->capacity() - this->size())
531 this->grow(this->size()+NumInputs);
532
533 std::uninitialized_fill_n(this->end(), NumInputs, Elt);
534 this->set_size(this->size() + NumInputs);
535 }
536
537 void append(std::initializer_list<T> IL) {
538 append(IL.begin(), IL.end());
539 }
540
541 // FIXME: Consider assigning over existing elements, rather than clearing &
542 // re-initializing them - for all assign(...) variants.
543
544 void assign(size_type NumElts, const T &Elt) {
545 this->assertSafeToReferenceAfterResize(&Elt, 0);
546 clear();
547 if (this->capacity() < NumElts)
548 this->grow(NumElts);
549 this->set_size(NumElts);
550 std::uninitialized_fill(this->begin(), this->end(), Elt);
551 }
552
553 template <typename in_iter,
554 typename = std::enable_if_t<std::is_convertible<
555 typename std::iterator_traits<in_iter>::iterator_category,
556 std::input_iterator_tag>::value>>
557 void assign(in_iter in_start, in_iter in_end) {
558 this->assertSafeToReferenceAfterClear(in_start, in_end);
559 clear();
560 append(in_start, in_end);
561 }
562
563 void assign(std::initializer_list<T> IL) {
564 clear();
565 append(IL);
566 }
567
568 iterator erase(const_iterator CI) {
569 // Just cast away constness because this is a non-const member function.
570 iterator I = const_cast<iterator>(CI);
571
572 assert(I >= this->begin() && "Iterator to erase is out of bounds.");
573 assert(I < this->end() && "Erasing at past-the-end iterator.");
574
575 iterator N = I;
576 // Shift all elts down one.
577 std::move(I+1, this->end(), I);
578 // Drop the last elt.
579 this->pop_back();
580 return(N);
581 }
582
583 iterator erase(const_iterator CS, const_iterator CE) {
584 // Just cast away constness because this is a non-const member function.
585 iterator S = const_cast<iterator>(CS);
586 iterator E = const_cast<iterator>(CE);
587
588 assert(S >= this->begin() && "Range to erase is out of bounds.");
589 assert(S <= E && "Trying to erase invalid range.");
590 assert(E <= this->end() && "Trying to erase past the end.");
591
592 iterator N = S;
593 // Shift all elts down.
594 iterator I = std::move(E, this->end(), S);
595 // Drop the last elts.
596 this->destroy_range(I, this->end());
597 this->set_size(I - this->begin());
598 return(N);
599 }
600
601private:
602 template <class ArgType> iterator insert_one_impl(iterator I, ArgType &&Elt) {
603 if (I == this->end()) { // Important special case for empty vector.
604 this->push_back(::std::forward<ArgType>(Elt));
605 return this->end()-1;
606 }
607
608 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
609 assert(I <= this->end() && "Inserting past the end of the vector.");
610
611 // Check that adding an element won't invalidate Elt.
612 this->assertSafeToAdd(&Elt);
613
614 if (this->size() >= this->capacity()) {
615 size_t EltNo = I-this->begin();
616 this->grow();
617 I = this->begin()+EltNo;
618 }
619
620 ::new ((void*) this->end()) T(::std::move(this->back()));
621 // Push everything else over.
622 std::move_backward(I, this->end()-1, this->end());
623 this->set_size(this->size() + 1);
624
625 // If we just moved the element we're inserting, be sure to update
626 // the reference.
627 std::remove_reference_t<ArgType> *EltPtr = &Elt;
628 if (I <= EltPtr && EltPtr < this->end())
629 ++EltPtr;
630
631 *I = ::std::forward<ArgType>(*EltPtr);
632 return I;
633 }
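A minimal sketch of the aliasing case handled above (the EltPtr adjustment after elements are shifted), assuming no reallocation is needed so the safety assertion does not fire; names are placeholders:

#include "llvm/ADT/SmallVector.h"

static void demoInsertOne() {
  llvm::SmallVector<int, 4> V = {1, 2, 3}; // capacity 4, so no regrowth below
  // The reference aliases an element at/after the insertion point, so the
  // EltPtr bump above kicks in once the tail has been shifted over.
  V.insert(V.begin() + 1, V[2]); // V == {1, 3, 2, 3}
}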
634
635public:
636 iterator insert(iterator I, T &&Elt) {
637 return insert_one_impl(I, std::move(Elt));
638 }
639
640 iterator insert(iterator I, const T &Elt) { return insert_one_impl(I, Elt); }
641
642 iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
643 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
644 size_t InsertElt = I - this->begin();
645
646 if (I == this->end()) { // Important special case for empty vector.
647 append(NumToInsert, Elt);
648 return this->begin()+InsertElt;
649 }
650
651 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
652 assert(I <= this->end() && "Inserting past the end of the vector.");
653
654 // Check that adding NumToInsert elements won't invalidate Elt.
655 this->assertSafeToAdd(&Elt, NumToInsert);
656
657 // Ensure there is enough space.
658 reserve(this->size() + NumToInsert);
659
660 // Uninvalidate the iterator.
661 I = this->begin()+InsertElt;
662
663 // If there are more elements between the insertion point and the end of the
664 // range than there are being inserted, we can use a simple approach to
665 // insertion. Since we already reserved space, we know that this won't
666 // reallocate the vector.
667 if (size_t(this->end()-I) >= NumToInsert) {
668 T *OldEnd = this->end();
669 append(std::move_iterator<iterator>(this->end() - NumToInsert),
670 std::move_iterator<iterator>(this->end()));
671
672 // Copy the existing elements that get replaced.
673 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
674
675 std::fill_n(I, NumToInsert, Elt);
676 return I;
677 }
678
679 // Otherwise, we're inserting more elements than exist already, and we're
680 // not inserting at the end.
681
682 // Move over the elements that we're about to overwrite.
683 T *OldEnd = this->end();
684 this->set_size(this->size() + NumToInsert);
685 size_t NumOverwritten = OldEnd-I;
686 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
687
688 // Replace the overwritten part.
689 std::fill_n(I, NumOverwritten, Elt);
690
691 // Insert the non-overwritten middle part.
692 std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
693 return I;
694 }
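A small sketch exercising both branches of this overload (enough trailing elements vs. inserting more copies than remain after the point); names are placeholders:

#include "llvm/ADT/SmallVector.h"

static void demoInsertN() {
  llvm::SmallVector<int> V = {1, 2, 3, 4};
  // Three trailing elements >= three copies: the "simple approach" branch runs.
  V.insert(V.begin() + 1, 3, 9); // V == {1, 9, 9, 9, 2, 3, 4}
  // Only one element remains after the point, so the second branch runs.
  V.insert(V.end() - 1, 5, 7);   // V == {1, 9, 9, 9, 2, 3, 7, 7, 7, 7, 7, 4}
}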
695
696 template <typename ItTy,
697 typename = std::enable_if_t<std::is_convertible<
698 typename std::iterator_traits<ItTy>::iterator_category,
699 std::input_iterator_tag>::value>>
700 iterator insert(iterator I, ItTy From, ItTy To) {
701 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
702 size_t InsertElt = I - this->begin();
703
704 if (I == this->end()) { // Important special case for empty vector.
705 append(From, To);
706 return this->begin()+InsertElt;
707 }
708
709 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
710 assert(I <= this->end() && "Inserting past the end of the vector.");
711
712 // Check that the reserve that follows doesn't invalidate the iterators.
713 this->assertSafeToAddRange(From, To);
714
715 size_t NumToInsert = std::distance(From, To);
716
717 // Ensure there is enough space.
718 reserve(this->size() + NumToInsert);
719
720 // Uninvalidate the iterator.
721 I = this->begin()+InsertElt;
722
723 // If there are more elements between the insertion point and the end of the
724 // range than there are being inserted, we can use a simple approach to
725 // insertion. Since we already reserved space, we know that this won't
726 // reallocate the vector.
727 if (size_t(this->end()-I) >= NumToInsert) {
728 T *OldEnd = this->end();
729 append(std::move_iterator<iterator>(this->end() - NumToInsert),
730 std::move_iterator<iterator>(this->end()));
731
732 // Copy the existing elements that get replaced.
733 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
734
735 std::copy(From, To, I);
736 return I;
737 }
738
739 // Otherwise, we're inserting more elements than exist already, and we're
740 // not inserting at the end.
741
742 // Move over the elements that we're about to overwrite.
743 T *OldEnd = this->end();
744 this->set_size(this->size() + NumToInsert);
745 size_t NumOverwritten = OldEnd-I;
746 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
747
748 // Replace the overwritten part.
749 for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
750 *J = *From;
751 ++J; ++From;
752 }
753
754 // Insert the non-overwritten middle part.
755 this->uninitialized_copy(From, To, OldEnd);
756 return I;
757 }
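A minimal sketch of the iterator-range overload, assuming the header above is available; raw pointers satisfy the input-iterator requirement:

#include "llvm/ADT/SmallVector.h"
#include <iterator>

static void demoInsertRange() {
  int More[] = {7, 8};
  llvm::SmallVector<int> V = {1, 2, 3};
  V.insert(V.begin() + 1, std::begin(More), std::end(More)); // V == {1, 7, 8, 2, 3}
}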
758
759 void insert(iterator I, std::initializer_list<T> IL) {
760 insert(I, IL.begin(), IL.end());
761 }
762
763 template <typename... ArgTypes> reference emplace_back(ArgTypes &&... Args) {
764 this->assertSafeToEmplace(Args...);
765 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
766 this->grow();
767 ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
768 this->set_size(this->size() + 1);
769 return this->back();
770 }
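A short sketch contrasting emplace_back (in-place construction at end()) with push_back, assuming the header above; names are placeholders:

#include "llvm/ADT/SmallVector.h"
#include <string>
#include <utility>

static void demoEmplace() {
  llvm::SmallVector<std::pair<int, std::string>, 2> Entries;
  // The pair is constructed directly in the vector's storage.
  Entries.emplace_back(1, "one");
  // push_back copies/moves an already-constructed temporary instead.
  Entries.push_back(std::make_pair(2, std::string("two")));
}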
771
772 SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
773
774 SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
775
776 bool operator==(const SmallVectorImpl &RHS) const {
777 if (this->size() != RHS.size()) return false;
778 return std::equal(this->begin(), this->end(), RHS.begin());
779 }
780 bool operator!=(const SmallVectorImpl &RHS) const {
781 return !(*this == RHS);
782 }
783
784 bool operator<(const SmallVectorImpl &RHS) const {
785 return std::lexicographical_compare(this->begin(), this->end(),
786 RHS.begin(), RHS.end());
787 }
788};
789
790template <typename T>
791void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
792 if (this == &RHS) return;
793
794 // We can only avoid copying elements if neither vector is small.
795 if (!this->isSmall() && !RHS.isSmall()) {
796 std::swap(this->BeginX, RHS.BeginX);
797 std::swap(this->Size, RHS.Size);
798 std::swap(this->Capacity, RHS.Capacity);
799 return;
800 }
801 if (RHS.size() > this->capacity())
802 this->grow(RHS.size());
803 if (this->size() > RHS.capacity())
804 RHS.grow(this->size());
805
806 // Swap the shared elements.
807 size_t NumShared = this->size();
808 if (NumShared > RHS.size()) NumShared = RHS.size();
809 for (size_type i = 0; i != NumShared; ++i)
810 std::swap((*this)[i], RHS[i]);
811
812 // Copy over the extra elts.
813 if (this->size() > RHS.size()) {
814 size_t EltDiff = this->size() - RHS.size();
815 this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
816 RHS.set_size(RHS.size() + EltDiff);
817 this->destroy_range(this->begin()+NumShared, this->end());
818 this->set_size(NumShared);
819 } else if (RHS.size() > this->size()) {
820 size_t EltDiff = RHS.size() - this->size();
821 this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
822 this->set_size(this->size() + EltDiff);
823 this->destroy_range(RHS.begin()+NumShared, RHS.end());
824 RHS.set_size(NumShared);
825 }
826}
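A sketch of the case the comments above describe, where the buffers cannot simply be exchanged because one side still uses its inline storage; names are placeholders:

#include "llvm/ADT/SmallVector.h"

static void demoSwap() {
  llvm::SmallVector<int, 4> A = {1, 2};             // still in inline storage
  llvm::SmallVector<int, 4> B = {1, 2, 3, 4, 5, 6}; // spilled to the heap
  // The shared prefix is swapped element-wise and the extra elements are copied.
  A.swap(B); // A == {1, 2, 3, 4, 5, 6}, B == {1, 2}
}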
827
828template <typename T>
829SmallVectorImpl<T> &SmallVectorImpl<T>::
830 operator=(const SmallVectorImpl<T> &RHS) {
831 // Avoid self-assignment.
832 if (this == &RHS) return *this;
833
834 // If we already have sufficient space, assign the common elements, then
835 // destroy any excess.
836 size_t RHSSize = RHS.size();
837 size_t CurSize = this->size();
838 if (CurSize >= RHSSize) {
839 // Assign common elements.
840 iterator NewEnd;
841 if (RHSSize)
842 NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
843 else
844 NewEnd = this->begin();
845
846 // Destroy excess elements.
847 this->destroy_range(NewEnd, this->end());
848
849 // Trim.
850 this->set_size(RHSSize);
851 return *this;
852 }
853
854 // If we have to grow to have enough elements, destroy the current elements.
855 // This allows us to avoid copying them during the grow.
856 // FIXME: don't do this if they're efficiently moveable.
857 if (this->capacity() < RHSSize) {
858 // Destroy current elements.
859 this->destroy_range(this->begin(), this->end());
860 this->set_size(0);
861 CurSize = 0;
862 this->grow(RHSSize);
863 } else if (CurSize) {
864 // Otherwise, use assignment for the already-constructed elements.
865 std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
866 }
867
868 // Copy construct the new elements in place.
869 this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
870 this->begin()+CurSize);
871
872 // Set end.
873 this->set_size(RHSSize);
874 return *this;
875}
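A minimal sketch of the first path above (the destination already has at least as many constructed elements as the source); names are placeholders:

#include "llvm/ADT/SmallVector.h"

static void demoCopyAssign() {
  llvm::SmallVector<int, 4> Dst = {9, 9, 9, 9, 9}; // five elements, heap-allocated
  llvm::SmallVector<int, 4> Src = {1, 2};
  // The common prefix is assigned and the excess destroyed; Dst keeps its capacity.
  Dst = Src; // Dst == {1, 2}
}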
876
877template <typename T>
878SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
879 // Avoid self-assignment.
880 if (this == &RHS) return *this;
881
882 // If the RHS isn't small, clear this vector and then steal its buffer.
883 if (!RHS.isSmall()) {
884 this->destroy_range(this->begin(), this->end());
885 if (!this->isSmall()) free(this->begin());
886 this->BeginX = RHS.BeginX;
887 this->Size = RHS.Size;
888 this->Capacity = RHS.Capacity;
889 RHS.resetToSmall();
890 return *this;
891 }
892
893 // If we already have sufficient space, assign the common elements, then
894 // destroy any excess.
895 size_t RHSSize = RHS.size();
896 size_t CurSize = this->size();
897 if (CurSize >= RHSSize) {
898 // Assign common elements.
899 iterator NewEnd = this->begin();
900 if (RHSSize)
901 NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
902
903 // Destroy excess elements and trim the bounds.
904 this->destroy_range(NewEnd, this->end());
905 this->set_size(RHSSize);
906
907 // Clear the RHS.
908 RHS.clear();
909
910 return *this;
911 }
912
913 // If we have to grow to have enough elements, destroy the current elements.
914 // This allows us to avoid copying them during the grow.
915 // FIXME: this may not actually make any sense if we can efficiently move
916 // elements.
917 if (this->capacity() < RHSSize) {
918 // Destroy current elements.
919 this->destroy_range(this->begin(), this->end());
920 this->set_size(0);
921 CurSize = 0;
922 this->grow(RHSSize);
923 } else if (CurSize) {
924 // Otherwise, use assignment for the already-constructed elements.
925 std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
926 }
927
928 // Move-construct the new elements in place.
929 this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
930 this->begin()+CurSize);
931
932 // Set end.
933 this->set_size(RHSSize);
934
935 RHS.clear();
936 return *this;
937}
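A minimal sketch of the buffer-stealing path above, taken when the right-hand side is not using its inline storage; names are placeholders:

#include "llvm/ADT/SmallVector.h"
#include <utility>

static void demoMoveAssign() {
  llvm::SmallVector<int, 2> Src = {1, 2, 3}; // has spilled to the heap
  llvm::SmallVector<int, 2> Dst;
  // Src is not small, so Dst takes over Src's heap buffer and Src is reset to
  // its empty inline state.
  Dst = std::move(Src);
}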
938
939/// Storage for the SmallVector elements. This is specialized for the N=0 case
940/// to avoid allocating unnecessary storage.
941template <typename T, unsigned N>
942struct SmallVectorStorage {
943 alignas(T) char InlineElts[N * sizeof(T)];
944};
945
946/// We need the storage to be properly aligned even for small-size of 0 so that
947/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
948/// well-defined.
949template <typename T> struct alignas(T) SmallVectorStorage<T, 0> {};
950
951/// Forward declaration of SmallVector so that
952/// calculateSmallVectorDefaultInlinedElements can reference
953/// `sizeof(SmallVector<T, 0>)`.
954template <typename T, unsigned N> class LLVM_GSL_OWNER SmallVector;
955
956/// Helper class for calculating the default number of inline elements for
957/// `SmallVector<T>`.
958///
959/// This should be migrated to a constexpr function when our minimum
960/// compiler support is enough for multi-statement constexpr functions.
961template <typename T> struct CalculateSmallVectorDefaultInlinedElements {
962 // Parameter controlling the default number of inlined elements
963 // for `SmallVector<T>`.
964 //
965 // The default number of inlined elements ensures that
966 // 1. There is at least one inlined element.
967 // 2. `sizeof(SmallVector<T>) <= kPreferredSmallVectorSizeof` unless
968 // it contradicts 1.
969 static constexpr size_t kPreferredSmallVectorSizeof = 64;
970
971 // static_assert that sizeof(T) is not "too big".
972 //
973 // Because our policy guarantees at least one inlined element, it is possible
974 // for an arbitrarily large inlined element to allocate an arbitrarily large
975 // amount of inline storage. We generally consider it an antipattern for a
976 // SmallVector to allocate an excessive amount of inline storage, so we want
977 // to call attention to these cases and make sure that users are making an
978 // intentional decision if they request a lot of inline storage.
979 //
980 // We want this assertion to trigger in pathological cases, but otherwise
981 // not be too easy to hit. To accomplish that, the cutoff is actually somewhat
982 // larger than kPreferredSmallVectorSizeof (otherwise,
983 // `SmallVector<SmallVector<T>>` would be one easy way to trip it, and that
984 // pattern seems useful in practice).
985 //
986 // One wrinkle is that this assertion is in theory non-portable, since
987 // sizeof(T) is in general platform-dependent. However, we don't expect this
988 // to be much of an issue, because most LLVM development happens on 64-bit
989 // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for
990 // 32-bit hosts, dodging the issue. The reverse situation, where development
991 // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a
992 // 64-bit host, is expected to be very rare.
993 static_assert(
994 sizeof(T) <= 256,
995 "You are trying to use a default number of inlined elements for "
996 "`SmallVector<T>` but `sizeof(T)` is really big! Please use an "
997 "explicit number of inlined elements with `SmallVector<T, N>` to make "
998 "sure you really want that much inline storage.");
999
1000 // Discount the size of the header itself when calculating the maximum inline
1001 // bytes.
1002 static constexpr size_t PreferredInlineBytes =
1003 kPreferredSmallVectorSizeof - sizeof(SmallVector<T, 0>);
1004 static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T);
1005 static constexpr size_t value =
1006 NumElementsThatFit == 0 ? 1 : NumElementsThatFit;
1007};
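A worked instance of the formula above, assuming a typical 64-bit host where SmallVector<int, 0> occupies 16 bytes (a pointer plus two 32-bit counters): the default N for int comes out to (64 - 16) / sizeof(int) == 12, so sizeof(SmallVector<int>) stays at 64 bytes. A hedged compile-time check of the policy:

#include "llvm/ADT/SmallVector.h"

// Holds whenever sizeof(T) is modest, per guarantee 2 above; exotic ABIs could
// in principle differ, so treat this as illustrative.
static_assert(sizeof(llvm::SmallVector<int>) <=
                  llvm::CalculateSmallVectorDefaultInlinedElements<
                      int>::kPreferredSmallVectorSizeof,
              "default N keeps SmallVector<int> within the preferred sizeof");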
1008
1009/// This is a 'vector' (really, a variable-sized array), optimized
1010/// for the case when the array is small. It contains some number of elements
1011/// in-place, which allows it to avoid heap allocation when the actual number of
1012/// elements is below that threshold. This allows normal "small" cases to be
1013/// fast without losing generality for large inputs.
1014///
1015/// \note
1016/// In the absence of a well-motivated choice for the number of inlined
1017/// elements \p N, it is recommended to use \c SmallVector<T> (that is,
1018/// omitting the \p N). This will choose a default number of inlined elements
1019/// reasonable for allocation on the stack (for example, trying to keep \c
1020/// sizeof(SmallVector<T>) around 64 bytes).
1021///
1022/// \warning This does not attempt to be exception safe.
1023///
1024template <typename T,
1025 unsigned N = CalculateSmallVectorDefaultInlinedElements<T>::value>
1026class LLVM_GSL_OWNER SmallVector : public SmallVectorImpl<T>,
1027 SmallVectorStorage<T, N> {
1028public:
1029 SmallVector() : SmallVectorImpl<T>(N) {}
1030
1031 ~SmallVector() {
1032 // Destroy the constructed elements in the vector.
1033 this->destroy_range(this->begin(), this->end());
1034 }
1035
1036 explicit SmallVector(size_t Size, const T &Value = T())
1037 : SmallVectorImpl<T>(N) {
1038 this->assign(Size, Value);
1039 }
1040
1041 template <typename ItTy,
1042 typename = std::enable_if_t<std::is_convertible<
1043 typename std::iterator_traits<ItTy>::iterator_category,
1044 std::input_iterator_tag>::value>>
1045 SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
1046 this->append(S, E);
1047 }
1048
1049 template <typename RangeTy>
1050 explicit SmallVector(const iterator_range<RangeTy> &R)
1051 : SmallVectorImpl<T>(N) {
1052 this->append(R.begin(), R.end());
1053 }
1054
1055 SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
1056 this->assign(IL);
1057 }
1058
1059 SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
1060 if (!RHS.empty())
1061 SmallVectorImpl<T>::operator=(RHS);
1062 }
1063
1064 SmallVector &operator=(const SmallVector &RHS) {
1065 SmallVectorImpl<T>::operator=(RHS);
1066 return *this;
1067 }
1068
1069 SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
1070 if (!RHS.empty())
1071 SmallVectorImpl<T>::operator=(::std::move(RHS));
1072 }
1073
1074 SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
1075 if (!RHS.empty())
1076 SmallVectorImpl<T>::operator=(::std::move(RHS));
1077 }
1078
1079 SmallVector &operator=(SmallVector &&RHS) {
1080 SmallVectorImpl<T>::operator=(::std::move(RHS));
1081 return *this;
1082 }
1083
1084 SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
1085 SmallVectorImpl<T>::operator=(::std::move(RHS));
1086 return *this;
1087 }
1088
1089 SmallVector &operator=(std::initializer_list<T> IL) {
1090 this->assign(IL);
1091 return *this;
1092 }
1093};
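A minimal usage sketch of the class documented above, assuming the header is on the include path; names are placeholders:

#include "llvm/ADT/SmallVector.h"

static void demoSmallVector() {
  llvm::SmallVector<int, 8> Fixed;   // up to 8 elements live inside the object
  for (int I = 0; I != 4; ++I)
    Fixed.push_back(I);              // no heap allocation so far
  // Omitting N picks a default aimed at keeping sizeof around 64 bytes.
  llvm::SmallVector<int> Default = {1, 2, 3};
  (void)Default;
}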
1094
1095template <typename T, unsigned N>
1096inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
1097 return X.capacity_in_bytes();
1098}
1099
1100/// Given a range of type R, iterate the entire range and return a
1101/// SmallVector with elements of the vector. This is useful, for example,
1102/// when you want to iterate a range and then sort the results.
1103template <unsigned Size, typename R>
1104SmallVector<typename std::remove_const<typename std::remove_reference<
1105 decltype(*std::begin(std::declval<R &>()))>::type>::type,
1106 Size>
1107to_vector(R &&Range) {
1108 return {std::begin(Range), std::end(Range)};
1109}
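A short sketch of to_vector as suggested by its comment (materialize a range, then sort it); names are placeholders:

#include "llvm/ADT/SmallVector.h"
#include <algorithm>

static void demoToVector() {
  int Data[] = {3, 1, 2};
  // Copy the range into a SmallVector with four inline slots so it can be mutated.
  auto Vec = llvm::to_vector<4>(Data); // llvm::SmallVector<int, 4>
  std::sort(Vec.begin(), Vec.end());   // Vec == {1, 2, 3}
}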
1110
1111} // end namespace llvm
1112
1113namespace std {
1114
1115 /// Implement std::swap in terms of SmallVector swap.
1116 template<typename T>
1117 inline void
1118 swap(llvm::SmallVectorImpl<T> &LHS, llvm::SmallVectorImpl<T> &RHS) {
1119 LHS.swap(RHS);
1120 }
1121
1122 /// Implement std::swap in terms of SmallVector swap.
1123 template<typename T, unsigned N>
1124 inline void
1125 swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
1126 LHS.swap(RHS);
1127 }
1128
1129} // end namespace std
1130
1131#endif // LLVM_ADT_SMALLVECTOR_H