X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
99 "x86-br-merging-likely-bias", cl::init(0),
100 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
101 "that all conditionals will be executed. For example for merging "
102 "the conditionals (a == b && c > d), if its known that a == b is "
103 "likely, then it is likely that if the conditionals are split "
104 "both sides will be executed, so it may be desirable to increase "
105 "the instruction cost threshold. Set to -1 to never merge likely "
106 "branches."),
107 cl::Hidden);
108
110 "x86-br-merging-unlikely-bias", cl::init(-1),
111 cl::desc(
112 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
113 "that all conditionals will be executed. For example for merging "
114 "the conditionals (a == b && c > d), if its known that a == b is "
115 "unlikely, then it is unlikely that if the conditionals are split "
116 "both sides will be executed, so it may be desirable to decrease "
117 "the instruction cost threshold. Set to -1 to never merge unlikely "
118 "branches."),
119 cl::Hidden);
120
122 "mul-constant-optimization", cl::init(true),
123 cl::desc("Replace 'mul x, Const' with more effective instructions like "
124 "SHIFT, LEA, etc."),
125 cl::Hidden);
126
127X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
128                                     const X86Subtarget &STI)
129 : TargetLowering(TM), Subtarget(STI) {
130 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
131 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
132
133 // Set up the TargetLowering object.
134
135 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139
140 // X86 instruction cache is coherent with its data cache so we can use the
141 // default expansion to a no-op.
143
144 // For 64-bit, since we have so many registers, use the ILP scheduler.
145 // For 32-bit, use the register pressure specific scheduling.
146 // For Atom, always use ILP scheduling.
147 if (Subtarget.isAtom())
149 else if (Subtarget.is64Bit())
151 else
153 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
155
156 // Bypass expensive divides and use cheaper ones.
157 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
158 if (Subtarget.hasSlowDivide32())
159 addBypassSlowDiv(32, 8);
160 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
161 addBypassSlowDiv(64, 32);
162 }
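// Illustrative (not in the original source): addBypassSlowDiv(32, 8) asks
// CodeGenPrepare to guard a 32-bit divide with a cheaper 8-bit one when the
// operands happen to be small, roughly:
//
//   if (((a | b) >> 8) == 0)          // both operands fit in 8 bits
//     q = (uint8_t)a / (uint8_t)b;    // fast 8-bit DIV
//   else
//     q = a / b;                      // full-width 32-bit DIV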
163
164 // Setup Windows compiler runtime calls.
165 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
166 static const struct {
167 const RTLIB::Libcall Op;
168 const char * const Name;
169 const CallingConv::ID CC;
170 } LibraryCalls[] = {
171 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
172 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
173 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
174 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
175 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
176 };
177
178 for (const auto &LC : LibraryCalls) {
179 setLibcallName(LC.Op, LC.Name);
180 setLibcallCallingConv(LC.Op, LC.CC);
181 }
182 }
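// Illustrative (not in the original source): the table above only records names
// and calling conventions; the call itself is emitted when i64 division is
// legalized on 32-bit Windows, e.g.
//
//   %q = sdiv i64 %a, %b  -->  call x86_stdcallcc i64 @_alldiv(i64 %a, i64 %b)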
183
184 if (Subtarget.canUseCMPXCHG16B())
186 else if (Subtarget.canUseCMPXCHG8B())
188 else
190
191 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
192
194
195 // Set up the register classes.
196 addRegisterClass(MVT::i8, &X86::GR8RegClass);
197 addRegisterClass(MVT::i16, &X86::GR16RegClass);
198 addRegisterClass(MVT::i32, &X86::GR32RegClass);
199 if (Subtarget.is64Bit())
200 addRegisterClass(MVT::i64, &X86::GR64RegClass);
201
202 for (MVT VT : MVT::integer_valuetypes())
204
205 // We don't accept any truncstore of integer registers.
206 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
209 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
212
213 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
214
215 // SETOEQ and SETUNE require checking two conditions.
216 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
219 }
220
221 // Integer absolute.
222 if (Subtarget.canUseCMOV()) {
223 setOperationAction(ISD::ABS , MVT::i16 , Custom);
224 setOperationAction(ISD::ABS , MVT::i32 , Custom);
225 if (Subtarget.is64Bit())
226 setOperationAction(ISD::ABS , MVT::i64 , Custom);
227 }
228
229 // Absolute difference.
230 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
231 setOperationAction(Op , MVT::i8 , Custom);
232 setOperationAction(Op , MVT::i16 , Custom);
233 setOperationAction(Op , MVT::i32 , Custom);
234 if (Subtarget.is64Bit())
235 setOperationAction(Op , MVT::i64 , Custom);
236 }
237
238 // Signed saturation subtraction.
242 if (Subtarget.is64Bit())
244
245 // Funnel shifts.
246 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
247 // For slow shld targets we only lower for code size.
248 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
249
250 setOperationAction(ShiftOp , MVT::i8 , Custom);
251 setOperationAction(ShiftOp , MVT::i16 , Custom);
252 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
253 if (Subtarget.is64Bit())
254 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
255 }
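// Illustrative (not in the original source): on subtargets with fast SHLD/SHRD
// the i32/i64 funnel shifts map straight onto those instructions, roughly
//
//   %r = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 %s)
//     -->  shldl %cl, %lo_reg, %hi_reg
//
// while on isSHLDSlow() subtargets the Custom handler only forms SHLD/SHRD when
// optimizing for size and otherwise expands to two shifts and an OR.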
256
257 if (!Subtarget.useSoftFloat()) {
258 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
259 // operation.
264 // We have an algorithm for SSE2, and we turn this into a 64-bit
265 // FILD or VCVTUSI2SS/SD for other targets.
268 // We have an algorithm for SSE2->double, and we turn this into a
269 // 64-bit FILD followed by conditional FADD for other targets.
272
273 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
274 // this operation.
277 // SSE has no i16 to fp conversion, only i32. We promote in the handler
278 // to allow f80 to use i16 and f64 to use i16 with sse1 only
281 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
284 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
285 // are Legal, f80 is custom lowered.
288
289 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
290 // this operation.
292 // FIXME: This doesn't generate invalid exception when it should. PR44019.
298 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
299 // are Legal, f80 is custom lowered.
302
303 // Handle FP_TO_UINT by promoting the destination to a larger signed
304 // conversion.
306 // FIXME: This doesn't generate invalid exception when it should. PR44019.
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
315
320
321 if (!Subtarget.is64Bit()) {
324 }
325 }
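// Illustrative (not in the original source): "promote UINT_TO_FP to a larger
// SINT_TO_FP" means, for example,
//
//   %f = uitofp i32 %x to double   -->  zero-extend %x to i64, then cvtsi2sd
//
// which is exact because the zero-extended value is never negative.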
326
327 if (Subtarget.hasSSE2()) {
328 // Custom lowering for saturating float to int conversions.
329 // We handle promotion to larger result types manually.
330 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
333 }
334 if (Subtarget.is64Bit()) {
337 }
338 }
339
340 // Handle address space casts between mixed sized pointers.
343
344 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
345 if (!Subtarget.hasSSE2()) {
348 if (Subtarget.is64Bit()) {
350 // Without SSE, i64->f64 goes through memory.
352 }
353 } else if (!Subtarget.is64Bit())
355
356 // Scalar integer divide and remainder are lowered to use operations that
357 // produce two results, to match the available instructions. This exposes
358 // the two-result form to trivial CSE, which is able to combine x/y and x%y
359 // into a single instruction.
360 //
361 // Scalar integer multiply-high is also lowered to use two-result
362 // operations, to match the available instructions. However, plain multiply
363 // (low) operations are left as Legal, as there are single-result
364 // instructions for this in x86. Using the two-result multiply instructions
365 // when both high and low results are needed must be arranged by dagcombine.
366 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 }
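// Illustrative (not in the original source): because SDIV/SREM (and UDIV/UREM)
// are expanded to the two-result [SU]DIVREM nodes, computing both x/y and x%y
// CSEs into one hardware divide, which already produces both results:
//
//   idivl %ecx        ; quotient in EAX, remainder in EDX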
374
375 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
377 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
378 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
381 }
382 if (Subtarget.is64Bit())
387
388 setOperationAction(ISD::FREM , MVT::f32 , Expand);
389 setOperationAction(ISD::FREM , MVT::f64 , Expand);
390 setOperationAction(ISD::FREM , MVT::f80 , Expand);
391 setOperationAction(ISD::FREM , MVT::f128 , Expand);
392
393 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
410
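// Illustrative (not in the original source): promoting the narrow cttz to i32
// lets an i16 count-trailing-zeros be emitted as the 32-bit tzcnt/bsf form,
// avoiding tzcntw; the promotion typically ORs in a bit just above the
// original width so a zero input still produces the correct count of 16.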
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
414 if (Subtarget.is64Bit()) {
415 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
417 }
418 }
419
420 if (Subtarget.hasLZCNT()) {
421 // When promoting the i8 variants, force them to i32 for a shorter
422 // encoding.
423 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
425 } else {
426 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
427 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 continue;
431 }
432 }
433
436 // Special handling for half-precision floating point conversions.
437 // If we don't have F16C support, then lower half float conversions
438 // into library calls.
440 Op, MVT::f32,
441 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
442 // There's never any support for operations beyond MVT::f32.
443 setOperationAction(Op, MVT::f64, Expand);
444 setOperationAction(Op, MVT::f80, Expand);
445 setOperationAction(Op, MVT::f128, Expand);
446 }
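// Illustrative (not in the original source): without F16C a half<->float
// conversion such as (float)h becomes a call to __extendhfsf2 / __truncsfhf2,
// while with F16C the f32 case is custom-lowered to vcvtph2ps / vcvtps2ph;
// the wider f64/f80/f128 cases are always expanded and go through f32 first.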
447
448 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
451 }
452
453 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
454 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
455 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
456 setTruncStoreAction(VT, MVT::f16, Expand);
457 setTruncStoreAction(VT, MVT::bf16, Expand);
458
461 }
462
466 if (Subtarget.is64Bit())
468 if (Subtarget.hasPOPCNT()) {
469 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
470 // popcntw is longer to encode than popcntl and also has a false dependency
471 // on the dest that popcntl hasn't had since Cannon Lake.
472 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
473 } else {
478 }
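// Illustrative (not in the original source): with the promotion above an i16
// popcount is emitted as, roughly,
//
//   movzwl %cx, %eax
//   popcntl %eax, %eax
//
// which is shorter to encode than popcntw and avoids its false dependency.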
479
481
482 if (!Subtarget.hasMOVBE())
484
485 // X86 wants to expand cmov itself.
486 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
491 }
492 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
493 if (VT == MVT::i64 && !Subtarget.is64Bit())
494 continue;
497 }
498
499 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
502
504 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
505 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
509 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
510 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
511
512 // Darwin ABI issue.
513 for (auto VT : { MVT::i32, MVT::i64 }) {
514 if (VT == MVT::i64 && !Subtarget.is64Bit())
515 continue;
522 }
523
524 // 64-bit shl, sra, srl (iff 32-bit x86)
525 for (auto VT : { MVT::i32, MVT::i64 }) {
526 if (VT == MVT::i64 && !Subtarget.is64Bit())
527 continue;
531 }
532
533 if (Subtarget.hasSSEPrefetch())
535
537
538 // Expand certain atomics
539 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
547 }
548
549 if (!Subtarget.is64Bit())
551
552 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
553 // All CPUs supporting AVX will atomically load/store aligned 128-bit
554 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 }
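// Illustrative (not in the original source): this is what lets an aligned
//
//   %v = load atomic i128, ptr %p monotonic, align 16
//
// be selected to a single 16-byte vmovaps/vmovdqa on AVX CPUs instead of
// falling back to a cmpxchg16b loop.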
558
559 if (Subtarget.canUseCMPXCHG16B())
561
562 // FIXME - use subtarget debug flags
563 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
564 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
565 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
567 }
568
571
574
575 setOperationAction(ISD::TRAP, MVT::Other, Legal);
577 if (Subtarget.isTargetPS())
579 else
581
582 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
584 setOperationAction(ISD::VAEND , MVT::Other, Expand);
585 bool Is64Bit = Subtarget.is64Bit();
586 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
587 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
588
591
593
594 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597
599
600 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
601 setOperationAction(ISD::FABS, VT, Action);
602 setOperationAction(ISD::FNEG, VT, Action);
604 setOperationAction(ISD::FREM, VT, Action);
605 setOperationAction(ISD::FMA, VT, Action);
606 setOperationAction(ISD::FMINNUM, VT, Action);
607 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FSIN, VT, Action);
611 setOperationAction(ISD::FCOS, VT, Action);
612 setOperationAction(ISD::FSINCOS, VT, Action);
613 setOperationAction(ISD::FTAN, VT, Action);
614 setOperationAction(ISD::FSQRT, VT, Action);
615 setOperationAction(ISD::FPOW, VT, Action);
616 setOperationAction(ISD::FLOG, VT, Action);
617 setOperationAction(ISD::FLOG2, VT, Action);
618 setOperationAction(ISD::FLOG10, VT, Action);
619 setOperationAction(ISD::FEXP, VT, Action);
620 setOperationAction(ISD::FEXP2, VT, Action);
621 setOperationAction(ISD::FEXP10, VT, Action);
622 setOperationAction(ISD::FCEIL, VT, Action);
623 setOperationAction(ISD::FFLOOR, VT, Action);
625 setOperationAction(ISD::FRINT, VT, Action);
626 setOperationAction(ISD::BR_CC, VT, Action);
627 setOperationAction(ISD::SETCC, VT, Action);
630 setOperationAction(ISD::FROUND, VT, Action);
632 setOperationAction(ISD::FTRUNC, VT, Action);
633 setOperationAction(ISD::FLDEXP, VT, Action);
634 };
635
636 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
637 // f16, f32 and f64 use SSE.
638 // Set up the FP register classes.
639 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
640 : &X86::FR16RegClass);
641 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
642 : &X86::FR32RegClass);
643 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
644 : &X86::FR64RegClass);
645
646 // Disable f32->f64 extload as we can only generate this in one instruction
647 // under optsize. So it's easier to pattern match (fpext (load)) for that
648 // case instead of needing to emit 2 instructions for extload in the
649 // non-optsize case.
650 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
651
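// Illustrative (not in the original source): keeping this extload illegal means
// (f64 (fpext (load f32 %p))) survives as two nodes; under optsize isel can
// still fold it into a single memory-operand cvtss2sd, while the normal path
// emits a separate load followed by a register-to-register cvtss2sd.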
652 for (auto VT : { MVT::f32, MVT::f64 }) {
653 // Use ANDPD to simulate FABS.
655
656 // Use XORP to simulate FNEG.
658
659 // Use ANDPD and ORPD to simulate FCOPYSIGN.
661
662 // These might be better off as horizontal vector ops.
665
666 // We don't support sin/cos/fmod
670 }
671
672 // Half type will be promoted by default.
673 setF16Action(MVT::f16, Promote);
681
711
712 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
713 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
714
715 // Lower this to MOVMSK plus an AND.
718
719 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
720 (UseX87 || Is64Bit)) {
721 // Use SSE for f32, x87 for f64.
722 // Set up the FP register classes.
723 addRegisterClass(MVT::f32, &X86::FR32RegClass);
724 if (UseX87)
725 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
726
727 // Use ANDPS to simulate FABS.
729
730 // Use XORP to simulate FNEG.
732
733 if (UseX87)
735
736 // Use ANDPS and ORPS to simulate FCOPYSIGN.
737 if (UseX87)
740
741 // We don't support sin/cos/fmod
745
746 if (UseX87) {
747 // Always expand sin/cos functions even though x87 has an instruction.
751 }
752 } else if (UseX87) {
753 // f32 and f64 in x87.
754 // Set up the FP register classes.
755 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
756 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
757
758 for (auto VT : { MVT::f32, MVT::f64 }) {
761
762 // Always expand sin/cos functions even though x87 has an instruction.
766 }
767 }
768
769 // Expand FP32 immediates into loads from the stack, save special cases.
770 if (isTypeLegal(MVT::f32)) {
771 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
772 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
773 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
774 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
775 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
776 } else // SSE immediates.
777 addLegalFPImmediate(APFloat(+0.0f)); // xorps
778 }
779 // Expand FP64 immediates into loads from the stack, save special cases.
780 if (isTypeLegal(MVT::f64)) {
781 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
782 addLegalFPImmediate(APFloat(+0.0)); // FLD0
783 addLegalFPImmediate(APFloat(+1.0)); // FLD1
784 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
785 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
786 } else // SSE immediates.
787 addLegalFPImmediate(APFloat(+0.0)); // xorpd
788 }
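// Illustrative (not in the original source): addLegalFPImmediate() records
// constants that never need a constant-pool load; with SSE2 a 0.0 double is
// simply materialized as
//
//   xorpd %xmm0, %xmm0
//
// while the x87 stack uses FLD0/FLD1, optionally followed by FCHS for -0.0/-1.0.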
789 // Support fp16 0 immediate.
790 if (isTypeLegal(MVT::f16))
791 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
792
793 // Handle constrained floating-point operations of scalar.
806
807 // We don't support FMA.
810
811 // f80 always uses X87.
812 if (UseX87) {
813 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
816 {
818 addLegalFPImmediate(TmpFlt); // FLD0
819 TmpFlt.changeSign();
820 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
821
822 bool ignored;
823 APFloat TmpFlt2(+1.0);
825 &ignored);
826 addLegalFPImmediate(TmpFlt2); // FLD1
827 TmpFlt2.changeSign();
828 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
829 }
830
831 // Always expand sin/cos functions even though x87 has an instruction.
832 // clang-format off
843 // clang-format on
844
856
857 // Handle constrained floating-point operations of scalar.
863 if (isTypeLegal(MVT::f16)) {
866 } else {
868 }
869 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
870 // as Custom.
872 }
873
874 // f128 uses xmm registers, but most operations require libcalls.
875 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
876 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
877 : &X86::VR128RegClass);
878
879 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
880
891
895
896 // clang-format off
904 // clang-format on
905 // No STRICT_FSINCOS
908
911 // We need to custom handle any FP_ROUND with an f128 input, but
912 // LegalizeDAG uses the result type to know when to run a custom handler.
913 // So we have to list all legal floating point result types here.
914 if (isTypeLegal(MVT::f32)) {
917 }
918 if (isTypeLegal(MVT::f64)) {
921 }
922 if (isTypeLegal(MVT::f80)) {
925 }
926
928
929 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
930 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
931 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
932 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
933 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
934 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
935 }
936
937 // Always use a library call for pow.
938 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
939 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
940 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
941 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
942
951
952 // Some FP actions are always expanded for vector types.
953 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
954 MVT::v4f32, MVT::v8f32, MVT::v16f32,
955 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
956 // clang-format off
970 // clang-format on
971 }
972
973 // First set operation action for all vector types to either promote
974 // (for widening) or expand (for scalarization). Then we will selectively
975 // turn on ones that can be effectively codegen'd.
1015 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1016 setTruncStoreAction(InnerVT, VT, Expand);
1017
1018 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1019 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1020
1021 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1022 // types, we have to deal with them whether we ask for Expansion or not.
1023 // Setting Expand causes its own optimisation problems though, so leave
1024 // them legal.
1025 if (VT.getVectorElementType() == MVT::i1)
1026 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1027
1028 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1029 // split/scalarized right now.
1030 if (VT.getVectorElementType() == MVT::f16 ||
1031 VT.getVectorElementType() == MVT::bf16)
1032 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1033 }
1034 }
1035
1036 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1037 // with -msoft-float, disable use of MMX as well.
1038 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1039 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1040 // No operations on x86mmx supported, everything uses intrinsics.
1041 }
1042
1043 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1044 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045 : &X86::VR128RegClass);
1046
1049
1050 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1051 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1058
1059 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1060 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1061
1067 }
1068
1069 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1070 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1071 : &X86::VR128RegClass);
1072
1073 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1074 // registers cannot be used even for integer operations.
1075 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1076 : &X86::VR128RegClass);
1077 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1078 : &X86::VR128RegClass);
1079 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1080 : &X86::VR128RegClass);
1081 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1082 : &X86::VR128RegClass);
1083 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1084 : &X86::VR128RegClass);
1085
1086 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1089 }
1090
1091 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1092 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1097 }
1098
1099 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1100 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1101 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1102
1103 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1104 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1105 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1106 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1107 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1108 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1109 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1110 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1111 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1112 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1115
1116 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1117 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1118 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1119
1120 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1121 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1123
1124 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1125
1126 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1127 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1128 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1129 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1130 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1131 }
1132
1143
1148
1149 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1155
1156 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1157 // setcc all the way to isel and prefer SETGT in some isel patterns.
1160 }
1161
1162 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1163 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1168
1169 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1175 }
1176
1177 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1181
1182 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1183 continue;
1184
1187 }
1188 setF16Action(MVT::v8f16, Expand);
1189 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1190 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1191 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1192 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1193 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1194 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1196
1197 // Custom lower v2i64 and v2f64 selects.
1204
1211
1212 // Custom legalize these to avoid over promotion or custom promotion.
1213 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1218 }
1219
1224
1227
1230
1231 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1236
1241
1242 // We want to legalize this to an f64 load rather than an i64 load on
1243 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1244 // store.
1245 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1246 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1247 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1248 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1249 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1251
1252 // Add 32-bit vector stores to help vectorization opportunities.
1253 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1255
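// Illustrative (not in the original source): custom-lowering these narrow
// vector loads/stores means, e.g., a <2 x i32> load on x86-64 becomes one
// 8-byte scalar FP load (movsd/movq style) that is bitcast into the wider
// vector, rather than an i64 GPR load or two separate element loads.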
1259 if (!Subtarget.hasAVX512())
1261
1265
1267
1284
1285 // In the customized shift lowering, the legal v4i32/v2i64 cases
1286 // in AVX2 will be recognized.
1287 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1291 if (VT == MVT::v2i64) continue;
1296 }
1297
1303 }
1304
1305 if (Subtarget.hasGFNI()) {
1310 }
1311
1312 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1313 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1314 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1315 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1316
1317 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1320 }
1321
1322 // These might be better off as horizontal vector ops.
1327 }
1328
1329 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1330 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1333 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1337 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1343
1345 }
1346
1347 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1348 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1349 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1350 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1351 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1352 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1353 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1354 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1355
1359
1360 // FIXME: Do we need to handle scalar-to-vector here?
1361 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1362 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1363
1364 // We directly match byte blends in the backend as they match the VSELECT
1365 // condition form.
1367
1368 // SSE41 brings specific instructions for doing vector sign extend even in
1369 // cases where we don't have SRA.
1370 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1373 }
1374
1375 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1376 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1377 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1378 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1379 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1380 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1381 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1382 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1383 }
1384
1385 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1386 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1387 // do the pre and post work in the vector domain.
1390 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1391 // so that DAG combine doesn't try to turn it into uint_to_fp.
1394 }
1395 }
1396
1397 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1399 }
1400
1401 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1402 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1403 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1406 }
1407
1408 // XOP can efficiently perform BITREVERSE with VPPERM.
1409 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1411 }
1412
1413 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1414 bool HasInt256 = Subtarget.hasInt256();
1415
1416 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1417 : &X86::VR256RegClass);
1418 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1419 : &X86::VR256RegClass);
1420 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1421 : &X86::VR256RegClass);
1422 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1423 : &X86::VR256RegClass);
1424 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1425 : &X86::VR256RegClass);
1426 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1427 : &X86::VR256RegClass);
1428 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1429 : &X86::VR256RegClass);
1430
1431 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1444
1446
1450
1453 }
1454
1455 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1456 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1457
1458 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1459 // even though v8i16 is a legal type.
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1467
1474
1486
1487 if (!Subtarget.hasAVX512())
1489
1490 // In the customized shift lowering, the legal v8i32/v4i64 cases
1491 // in AVX2 will be recognized.
1492 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1498 if (VT == MVT::v4i64) continue;
1503 }
1504
1505 // These types need custom splitting if their input is a 128-bit vector.
1510
1514 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1515 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1518
1519 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1523 }
1524
1529
1530 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1535
1536 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1537 // setcc all the way to isel and prefer SETGT in some isel patterns.
1540 }
1541
1542 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1543 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1548
1549 if (Subtarget.hasAnyFMA()) {
1550 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1551 MVT::v2f64, MVT::v4f64 }) {
1554 }
1555 }
1556
1557 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1558 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1559 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1560 }
1561
1562 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1563 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1564 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1565 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1566
1567 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1568 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1569 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1572 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1573 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1574 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1575
1576 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1577 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1578
1579 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1580 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1581 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1582 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1583 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1584
1585 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1586 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1588 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1592 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1597
1598 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1599 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1601 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1603 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1604 }
1605
1606 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1609 }
1610
1611 if (HasInt256) {
1612 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1613 // when we have a 256bit-wide blend with immediate.
1616
1617 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1618 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1619 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1620 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1621 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1622 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1623 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1624 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1625 }
1626 }
1627
1628 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1629 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1630 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1632 }
1633
1634 // Extract subvector is special because the value type
1635 // (result) is 128-bit but the source is 256-bit wide.
1636 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1637 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1639 }
1640
1641 // Custom lower several nodes for 256-bit types.
1642 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1643 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1653 }
1654 setF16Action(MVT::v16f16, Expand);
1655 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1656 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1658 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1659 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1660 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1661 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1662
1663 if (HasInt256) {
1665
1666 // Custom legalize 2x32 to get a little better code.
1669
1670 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1671 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1673 }
1674 }
1675
1676 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1677 Subtarget.hasF16C()) {
1678 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1681 }
1682 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1685 }
1686 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1687 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1688 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1689 }
1690 }
1691
1692 // This block controls legalization of the mask vector sizes that are
1693 // available with AVX512. 512-bit vectors are in a separate block controlled
1694 // by useAVX512Regs.
1695 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1696 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1697 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1698 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1699 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1700 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1701
1705
1706 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1707 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1708 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1709 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1710 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1711 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1712 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1713 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1718
1719 // There is no byte sized k-register load or store without AVX512DQ.
1720 if (!Subtarget.hasDQI()) {
1721 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1722 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1723 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1724 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1725
1730 }
1731
1732 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1733 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1737 }
1738
1739 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1741
1742 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1746
1753 }
1754
1755 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1757 }
1758 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1759 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1762 }
1763 }
1764
1765 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1766 // elements. 512-bits can be disabled based on prefer-vector-width and
1767 // required-vector-width function attributes.
1768 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1769 bool HasBWI = Subtarget.hasBWI();
1770
1771 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1772 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1773 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1774 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1775 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1776 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1777 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1778
1779 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1780 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1781 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1782 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1783 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1784 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1785 if (HasBWI)
1786 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1787 }
1788
1789 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1797 }
1798 setOperationAction(ISD::LRINT, MVT::v16f32,
1799 Subtarget.hasDQI() ? Legal : Custom);
1800 setOperationAction(ISD::LRINT, MVT::v8f64,
1801 Subtarget.hasDQI() ? Legal : Custom);
1802 if (Subtarget.hasDQI())
1803 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1804
1805 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1810 }
1811
1812 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1817 }
1818
1825
1837
1838 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1839 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1840 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1841 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1842 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1843 if (HasBWI)
1844 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1845
1846 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1847 // to 512-bit rather than use the AVX2 instructions so that we can use
1848 // k-masks.
1849 if (!Subtarget.hasVLX()) {
1850 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1851 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1854 }
1855 }
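// Illustrative (not in the original source): widening a 256-bit masked load to
// 512 bits lets the predicate live in a k-register, roughly
//
//   vmovups (%rdi), %zmm0 {%k1} {z}
//
// on the widened type, instead of the AVX2 vmaskmovps form whose mask occupies
// a whole vector register.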
1856
1858 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1859 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1869
1870 if (HasBWI) {
1871 // Extends from v64i1 masks to 512-bit vectors.
1875 }
1876
1877 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1890
1892 }
1893
1894 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1897 }
1898
1899 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1900 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1901 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1902 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1903
1904 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1905 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1906 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1907 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1908
1909 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1910 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1911 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1912 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1913 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1914 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1915 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1916 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1917
1918 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1919 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1920
1921 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1931
1932 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1933 // setcc all the way to isel and prefer SETGT in some isel patterns.
1936 }
1937
1938 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1939 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1944
1945 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1952 }
1953
1954 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1955 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1956 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1958 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1959 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1966 }
1967
1968 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1969 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1970 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1971 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1972 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1973 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1974
1975 if (Subtarget.hasDQI()) {
1979 setOperationAction(Opc, MVT::v8i64, Custom);
1980 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1981 }
1982
1983 if (Subtarget.hasCDI()) {
1984 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1985 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1987 }
1988 } // Subtarget.hasCDI()
1989
1990 if (Subtarget.hasVPOPCNTDQ()) {
1991 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1993 }
1994
1995 // Extract subvector is special because the value type
1996 // (result) is 256-bit but the source is 512-bit wide.
1997 // 128-bit was made Legal under AVX1.
1998 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1999 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2001
2002 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2003 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2013 }
2014 setF16Action(MVT::v32f16, Expand);
2019 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2020 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2021
2022 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2027 }
2028 if (HasBWI) {
2029 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2032 }
2033 } else {
2034 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2035 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2036 }
2037
2038 if (Subtarget.hasVBMI2()) {
2039 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2042 }
2043
2044 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2045 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2046 }
2047
2048 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2049 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2051 }// useAVX512Regs
2052
2053 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2054 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2055 MVT::v4i64}) {
2058 }
2059 }
2060
2061 // This block controls legalization for operations that don't have
2062 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2063 // narrower widths.
2064 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2065 // These operations are handled on non-VLX by artificially widening in
2066 // isel patterns.
2067
2071
2072 if (Subtarget.hasDQI()) {
2073 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2074 // v2f32 UINT_TO_FP is already custom under SSE2.
2077 "Unexpected operation action!");
2078 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2083 }
2084
2085 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2091 }
2092
2093 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2096 }
2097
2098 // Custom legalize 2x32 to get a little better code.
2101
2102 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2103 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2105
2106 if (Subtarget.hasDQI()) {
2110 setOperationAction(Opc, MVT::v2i64, Custom);
2111 setOperationAction(Opc, MVT::v4i64, Custom);
2112 }
2113 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2114 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2115 }
2116
2117 if (Subtarget.hasCDI()) {
2118 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2120 }
2121 } // Subtarget.hasCDI()
2122
2123 if (Subtarget.hasVPOPCNTDQ()) {
2124 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2126 }
2127 }
2128
2129 // This block controls legalization of v32i1/v64i1, which are available with
2130 // AVX512BW.
2131 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2132 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2133 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2134
2135 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2146 }
2147
2148 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2150
2151 // Extends from v32i1 masks to 256-bit vectors.
2155
2156 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2157 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2158 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2159 }
2160
2161 // These operations are handled on non-VLX by artificially widening in
2162 // isel patterns.
2163 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2164
2165 if (Subtarget.hasBITALG()) {
2166 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2168 }
2169 }
2170
2171 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2172 auto setGroup = [&] (MVT VT) {
2183
2196
2198
2201
2207
2213
2217 };
2218
2219 // AVX512_FP16 scalar operations
2220 setGroup(MVT::f16);
2234
2237
2238 if (Subtarget.useAVX512Regs()) {
2239 setGroup(MVT::v32f16);
2245 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2252
2257 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2259 MVT::v32i16);
2260 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2262 MVT::v32i16);
2263 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2265 MVT::v32i16);
2266 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2268 MVT::v32i16);
2269
2273
2274 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2275 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2276 }
2277
2278 if (Subtarget.hasVLX()) {
2279 setGroup(MVT::v8f16);
2280 setGroup(MVT::v16f16);
2281
2292
2303
2304 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2307
2311
2312 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2313 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2314 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2315 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2316
2317 // Need to custom widen these to prevent scalarization.
2318 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2319 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2320 }
2321 }
2322
2323 if (!Subtarget.useSoftFloat() &&
2324 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2325 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2326 : &X86::VR128RegClass);
2327 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2328 : &X86::VR256RegClass);
2329 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2330 // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2331 // the operation action to Custom and do the customization later.
2334 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2335 setF16Action(VT, Expand);
2340 }
2341 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2342 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2343 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2344 }
2346 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2347 }
2348
2349 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2350 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2351 setF16Action(MVT::v32bf16, Expand);
2352 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2353 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2355 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2359 }
2360
2361 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2362 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2363 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2365 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2366 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2367
2368 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2369 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2370 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2371 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2372 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2373
2374 if (Subtarget.hasBWI()) {
2375 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2376 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2377 }
2378
2379 if (Subtarget.hasFP16()) {
2380 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2389 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2398 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2403 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2408 }
2409 }
2410
2411 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2412 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2413 }
2414
2415 // We want to custom lower some of our intrinsics.
2419 if (!Subtarget.is64Bit()) {
2421 }
2422
2423 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2424 // handle type legalization for these operations here.
2425 //
2426 // FIXME: We really should do custom legalization for addition and
2427 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2428 // than generic legalization for 64-bit multiplication-with-overflow, though.
2429 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2430 if (VT == MVT::i64 && !Subtarget.is64Bit())
2431 continue;
2432 // Add/Sub/Mul with overflow operations are custom lowered.
2439
2440 // Support carry in as value rather than glue.
2446 }
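// Illustrative (not in the original source): custom lowering the overflow
// nodes means a call like
//
//   {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//
// selects to a plain ADD whose OF/CF flag is consumed directly (SETO/SETC or a
// conditional branch) rather than being recomputed from the operands.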
2447
2448 // Combine sin / cos into _sincos_stret if it is available.
2449 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2450 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2453 }
2454
2455 if (Subtarget.isTargetWin64()) {
2456 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2457 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2458 setOperationAction(ISD::SREM, MVT::i128, Custom);
2459 setOperationAction(ISD::UREM, MVT::i128, Custom);
2468 }
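// Illustrative (not in the original source): Win64 has no native 128-bit
// integer argument class, so the Custom handler expands e.g. a 128-bit udiv
// into a call to the usual __udivti3-style runtime helper, marshalling the
// i128 operands to fit the Win64 calling convention.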
2469
2470 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2471 // is. We should promote the value to 64-bits to solve this.
2472 // This is what the CRT headers do - `fmodf` is an inline header
2473 // function casting to f64 and calling `fmod`.
2474 if (Subtarget.is32Bit() &&
2475 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2476 // clang-format off
2477 for (ISD::NodeType Op :
2488 if (isOperationExpand(Op, MVT::f32))
2489 setOperationAction(Op, MVT::f32, Promote);
2490 // clang-format on
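// Illustrative (not in the original source): with FREM (and friends) promoted,
// an f32 fmodf(x, y) on these targets is emitted as the f64 call
// fmod((double)x, (double)y) with the result truncated back to float, which
// matches what the CRT's inline fmodf wrapper does anyway.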
2491
2492 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2493 // it, but it's just a wrapper around ldexp.
2494 if (Subtarget.isOSWindows()) {
2496 if (isOperationExpand(Op, MVT::f32))
2497 setOperationAction(Op, MVT::f32, Promote);
2498 }
2499
2500 // We have target-specific dag combine patterns for the following nodes:
2511 ISD::SHL,
2512 ISD::SRA,
2513 ISD::SRL,
2514 ISD::OR,
2515 ISD::AND,
2521 ISD::ADD,
2522 ISD::FADD,
2523 ISD::FSUB,
2524 ISD::FNEG,
2525 ISD::FMA,
2529 ISD::SUB,
2530 ISD::LOAD,
2531 ISD::LRINT,
2533 ISD::MLOAD,
2534 ISD::STORE,
2548 ISD::SETCC,
2549 ISD::MUL,
2550 ISD::XOR,
2558
2560
2561 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2563 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2565 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2567
2568 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2569 // that needs to be benchmarked and balanced with the potential use of vector
2570 // load/store types (PR33329, PR33914).
2573
2574 // Default loop alignment, which can be overridden by -align-loops.
2576
2577 // An out-of-order CPU can speculatively execute past a predictable branch,
2578 // but a conditional move could be stalled by an expensive earlier operation.
2579 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2580 EnableExtLdPromotion = true;
2582
2584
2585 // Default to having -disable-strictnode-mutation on
2586 IsStrictFPEnabled = true;
2587}
2588
2589// This has so far only been implemented for 64-bit MachO.
2591 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2592}
2593
2595 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2596 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2597}
2598
2600 const SDLoc &DL) const {
2601 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2602 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2603 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2604 return SDValue(Node, 0);
2605}
2606
2609 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2610 !Subtarget.hasBWI())
2611 return TypeSplitVector;
2612
2613 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2614 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2615 return TypeSplitVector;
2616
2617 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2618 VT.getVectorElementType() != MVT::i1)
2619 return TypeWidenVector;
2620
2622}
2623
2624FastISel *
2626 const TargetLibraryInfo *libInfo) const {
2627 return X86::createFastISel(funcInfo, libInfo);
2628}
2629
2630//===----------------------------------------------------------------------===//
2631// Other Lowering Hooks
2632//===----------------------------------------------------------------------===//
2633
2635 bool AssumeSingleUse) {
2636 if (!AssumeSingleUse && !Op.hasOneUse())
2637 return false;
2638 if (!ISD::isNormalLoad(Op.getNode()))
2639 return false;
2640
2641 // If this is an unaligned vector, make sure the target supports folding it.
2642 auto *Ld = cast<LoadSDNode>(Op.getNode());
2643 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2644 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2645 return false;
2646
2647 // TODO: If this is a non-temporal load and the target has an instruction
2648 // for it, it should not be folded. See "useNonTemporalLoad()".
2649
2650 return true;
2651}
2652
2654 const X86Subtarget &Subtarget,
2655 bool AssumeSingleUse) {
2656 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2657 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2658 return false;
2659
2660 // We can not replace a wide volatile load with a broadcast-from-memory,
2661 // because that would narrow the load, which isn't legal for volatiles.
2662 auto *Ld = cast<LoadSDNode>(Op.getNode());
2663 return !Ld->isVolatile() ||
2664 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2665}
2666
2668 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2669}
2670
2672 if (Op.hasOneUse()) {
2673 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2674 return (ISD::ZERO_EXTEND == Opcode);
2675 }
2676 return false;
2677}
2678
2679static bool isLogicOp(unsigned Opcode) {
2680 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2681 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2682}
2683
2684static bool isTargetShuffle(unsigned Opcode) {
2685 switch(Opcode) {
2686 default: return false;
2687 case X86ISD::BLENDI:
2688 case X86ISD::PSHUFB:
2689 case X86ISD::PSHUFD:
2690 case X86ISD::PSHUFHW:
2691 case X86ISD::PSHUFLW:
2692 case X86ISD::SHUFP:
2693 case X86ISD::INSERTPS:
2694 case X86ISD::EXTRQI:
2695 case X86ISD::INSERTQI:
2696 case X86ISD::VALIGN:
2697 case X86ISD::PALIGNR:
2698 case X86ISD::VSHLDQ:
2699 case X86ISD::VSRLDQ:
2700 case X86ISD::MOVLHPS:
2701 case X86ISD::MOVHLPS:
2702 case X86ISD::MOVSHDUP:
2703 case X86ISD::MOVSLDUP:
2704 case X86ISD::MOVDDUP:
2705 case X86ISD::MOVSS:
2706 case X86ISD::MOVSD:
2707 case X86ISD::MOVSH:
2708 case X86ISD::UNPCKL:
2709 case X86ISD::UNPCKH:
2710 case X86ISD::VBROADCAST:
2711 case X86ISD::VPERMILPI:
2712 case X86ISD::VPERMILPV:
2713 case X86ISD::VPERM2X128:
2714 case X86ISD::SHUF128:
2715 case X86ISD::VPERMIL2:
2716 case X86ISD::VPERMI:
2717 case X86ISD::VPPERM:
2718 case X86ISD::VPERMV:
2719 case X86ISD::VPERMV3:
2720 case X86ISD::VZEXT_MOVL:
2721 return true;
2722 }
2723}
2724
2725static bool isTargetShuffleVariableMask(unsigned Opcode) {
2726 switch (Opcode) {
2727 default: return false;
2728 // Target Shuffles.
2729 case X86ISD::PSHUFB:
2730 case X86ISD::VPERMILPV:
2731 case X86ISD::VPERMIL2:
2732 case X86ISD::VPPERM:
2733 case X86ISD::VPERMV:
2734 case X86ISD::VPERMV3:
2735 return true;
2736 // 'Faux' Target Shuffles.
2737 case ISD::OR:
2738 case ISD::AND:
2739 case X86ISD::ANDNP:
2740 return true;
2741 }
2742}
2743
2746 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2748 int ReturnAddrIndex = FuncInfo->getRAIndex();
2749
2750 if (ReturnAddrIndex == 0) {
2751 // Set up a frame object for the return address.
2752 unsigned SlotSize = RegInfo->getSlotSize();
2753 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2754 -(int64_t)SlotSize,
2755 false);
2756 FuncInfo->setRAIndex(ReturnAddrIndex);
2757 }
2758
2759 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2760}
2761
2763 bool HasSymbolicDisplacement) {
2764 // Offset should fit into 32 bit immediate field.
2765 if (!isInt<32>(Offset))
2766 return false;
2767
2768 // If we don't have a symbolic displacement - we don't have any extra
2769 // restrictions.
2770 if (!HasSymbolicDisplacement)
2771 return true;
2772
2773 // We can fold large offsets in the large code model because we always use
2774 // 64-bit offsets.
2775 if (CM == CodeModel::Large)
2776 return true;
2777
2778 // For the kernel code model we know that all objects reside in the negative
2779 // half of the 32-bit address space. We must not accept negative offsets, since
2780 // they may be just out of range, but we may accept pretty large positive ones.
2781 if (CM == CodeModel::Kernel)
2782 return Offset >= 0;
2783
2784 // For other non-large code models we assume that the latest small object is
2785 // 16MB before the end of the 31-bit boundary. We may also accept pretty large
2786 // negative constants, knowing that all objects are in the positive half of the
2787 // address space.
2788 return Offset < 16 * 1024 * 1024;
2789}
2790
2791 /// Return true if the condition is a signed comparison operation.
2792static bool isX86CCSigned(unsigned X86CC) {
2793 switch (X86CC) {
2794 default:
2795 llvm_unreachable("Invalid integer condition!");
2796 case X86::COND_E:
2797 case X86::COND_NE:
2798 case X86::COND_B:
2799 case X86::COND_A:
2800 case X86::COND_BE:
2801 case X86::COND_AE:
2802 return false;
2803 case X86::COND_G:
2804 case X86::COND_GE:
2805 case X86::COND_L:
2806 case X86::COND_LE:
2807 return true;
2808 }
2809}
2810
2812 switch (SetCCOpcode) {
2813 // clang-format off
2814 default: llvm_unreachable("Invalid integer condition!");
2815 case ISD::SETEQ: return X86::COND_E;
2816 case ISD::SETGT: return X86::COND_G;
2817 case ISD::SETGE: return X86::COND_GE;
2818 case ISD::SETLT: return X86::COND_L;
2819 case ISD::SETLE: return X86::COND_LE;
2820 case ISD::SETNE: return X86::COND_NE;
2821 case ISD::SETULT: return X86::COND_B;
2822 case ISD::SETUGT: return X86::COND_A;
2823 case ISD::SETULE: return X86::COND_BE;
2824 case ISD::SETUGE: return X86::COND_AE;
2825 // clang-format on
2826 }
2827}
2828
2829/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2830/// condition code, returning the condition code and the LHS/RHS of the
2831/// comparison to make.
2833 bool isFP, SDValue &LHS, SDValue &RHS,
2834 SelectionDAG &DAG) {
2835 if (!isFP) {
2836 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2837 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2838 // X > -1 -> X == 0, jump !sign.
2839 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2840 return X86::COND_NS;
2841 }
2842 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2843 // X < 0 -> X == 0, jump on sign.
2844 return X86::COND_S;
2845 }
2846 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2847 // X >= 0 -> X == 0, jump on !sign.
2848 return X86::COND_NS;
2849 }
2850 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2851 // X < 1 -> X <= 0
2852 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2853 return X86::COND_LE;
2854 }
2855 }
2856
2857 return TranslateIntegerX86CC(SetCCOpcode);
2858 }
2859
2860 // First determine if it is required or is profitable to flip the operands.
2861
2862 // If LHS is a foldable load, but RHS is not, flip the condition.
2863 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2864 !ISD::isNON_EXTLoad(RHS.getNode())) {
2865 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2866 std::swap(LHS, RHS);
2867 }
2868
2869 switch (SetCCOpcode) {
2870 default: break;
2871 case ISD::SETOLT:
2872 case ISD::SETOLE:
2873 case ISD::SETUGT:
2874 case ISD::SETUGE:
2875 std::swap(LHS, RHS);
2876 break;
2877 }
2878
2879 // On a floating point condition, the flags are set as follows:
2880 // ZF PF CF op
2881 // 0 | 0 | 0 | X > Y
2882 // 0 | 0 | 1 | X < Y
2883 // 1 | 0 | 0 | X == Y
2884 // 1 | 1 | 1 | unordered
2885 switch (SetCCOpcode) {
2886 // clang-format off
2887 default: llvm_unreachable("Condcode should be pre-legalized away");
2888 case ISD::SETUEQ:
2889 case ISD::SETEQ: return X86::COND_E;
2890 case ISD::SETOLT: // flipped
2891 case ISD::SETOGT:
2892 case ISD::SETGT: return X86::COND_A;
2893 case ISD::SETOLE: // flipped
2894 case ISD::SETOGE:
2895 case ISD::SETGE: return X86::COND_AE;
2896 case ISD::SETUGT: // flipped
2897 case ISD::SETULT:
2898 case ISD::SETLT: return X86::COND_B;
2899 case ISD::SETUGE: // flipped
2900 case ISD::SETULE:
2901 case ISD::SETLE: return X86::COND_BE;
2902 case ISD::SETONE:
2903 case ISD::SETNE: return X86::COND_NE;
2904 case ISD::SETUO: return X86::COND_P;
2905 case ISD::SETO: return X86::COND_NP;
2906 case ISD::SETOEQ:
2907 case ISD::SETUNE: return X86::COND_INVALID;
2908 // clang-format on
2909 }
2910}
2911
2912/// Is there a floating point cmov for the specific X86 condition code?
2913 /// The current x86 ISA includes the following FP cmov instructions:
2914 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2915static bool hasFPCMov(unsigned X86CC) {
2916 switch (X86CC) {
2917 default:
2918 return false;
2919 case X86::COND_B:
2920 case X86::COND_BE:
2921 case X86::COND_E:
2922 case X86::COND_P:
2923 case X86::COND_A:
2924 case X86::COND_AE:
2925 case X86::COND_NE:
2926 case X86::COND_NP:
2927 return true;
2928 }
2929}
2930
2931static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2932 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2933 VT.is512BitVector();
2934}
2935
2937 const CallInst &I,
2938 MachineFunction &MF,
2939 unsigned Intrinsic) const {
2941 Info.offset = 0;
2942
2943 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2944 if (!IntrData) {
2945 switch (Intrinsic) {
2946 case Intrinsic::x86_aesenc128kl:
2947 case Intrinsic::x86_aesdec128kl:
2949 Info.ptrVal = I.getArgOperand(1);
2950 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2951 Info.align = Align(1);
2953 return true;
2954 case Intrinsic::x86_aesenc256kl:
2955 case Intrinsic::x86_aesdec256kl:
2957 Info.ptrVal = I.getArgOperand(1);
2958 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2959 Info.align = Align(1);
2961 return true;
2962 case Intrinsic::x86_aesencwide128kl:
2963 case Intrinsic::x86_aesdecwide128kl:
2965 Info.ptrVal = I.getArgOperand(0);
2966 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2967 Info.align = Align(1);
2969 return true;
2970 case Intrinsic::x86_aesencwide256kl:
2971 case Intrinsic::x86_aesdecwide256kl:
2973 Info.ptrVal = I.getArgOperand(0);
2974 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2975 Info.align = Align(1);
2977 return true;
2978 case Intrinsic::x86_cmpccxadd32:
2979 case Intrinsic::x86_cmpccxadd64:
2980 case Intrinsic::x86_atomic_bts:
2981 case Intrinsic::x86_atomic_btc:
2982 case Intrinsic::x86_atomic_btr: {
2984 Info.ptrVal = I.getArgOperand(0);
2985 unsigned Size = I.getType()->getScalarSizeInBits();
2986 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2987 Info.align = Align(Size);
2990 return true;
2991 }
2992 case Intrinsic::x86_atomic_bts_rm:
2993 case Intrinsic::x86_atomic_btc_rm:
2994 case Intrinsic::x86_atomic_btr_rm: {
2996 Info.ptrVal = I.getArgOperand(0);
2997 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2998 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2999 Info.align = Align(Size);
3002 return true;
3003 }
3004 case Intrinsic::x86_aadd32:
3005 case Intrinsic::x86_aadd64:
3006 case Intrinsic::x86_aand32:
3007 case Intrinsic::x86_aand64:
3008 case Intrinsic::x86_aor32:
3009 case Intrinsic::x86_aor64:
3010 case Intrinsic::x86_axor32:
3011 case Intrinsic::x86_axor64:
3012 case Intrinsic::x86_atomic_add_cc:
3013 case Intrinsic::x86_atomic_sub_cc:
3014 case Intrinsic::x86_atomic_or_cc:
3015 case Intrinsic::x86_atomic_and_cc:
3016 case Intrinsic::x86_atomic_xor_cc: {
3018 Info.ptrVal = I.getArgOperand(0);
3019 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3020 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3021 Info.align = Align(Size);
3024 return true;
3025 }
3026 }
3027 return false;
3028 }
3029
3030 switch (IntrData->Type) {
3033 case TRUNCATE_TO_MEM_VI32: {
3035 Info.ptrVal = I.getArgOperand(0);
3036 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3038 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3039 ScalarVT = MVT::i8;
3040 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3041 ScalarVT = MVT::i16;
3042 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3043 ScalarVT = MVT::i32;
3044
3045 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3046 Info.align = Align(1);
3048 break;
3049 }
3050 case GATHER:
3051 case GATHER_AVX2: {
3053 Info.ptrVal = nullptr;
3054 MVT DataVT = MVT::getVT(I.getType());
3055 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3056 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3057 IndexVT.getVectorNumElements());
3058 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3059 Info.align = Align(1);
3061 break;
3062 }
3063 case SCATTER: {
3065 Info.ptrVal = nullptr;
3066 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3067 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3068 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3069 IndexVT.getVectorNumElements());
3070 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3071 Info.align = Align(1);
3073 break;
3074 }
3075 default:
3076 return false;
3077 }
3078
3079 return true;
3080}
3081
3082/// Returns true if the target can instruction select the
3083/// specified FP immediate natively. If false, the legalizer will
3084/// materialize the FP immediate as a load from a constant pool.
3086 bool ForCodeSize) const {
3087 for (const APFloat &FPImm : LegalFPImmediates)
3088 if (Imm.bitwiseIsEqual(FPImm))
3089 return true;
3090 return false;
3091}
3092
3094 ISD::LoadExtType ExtTy,
3095 EVT NewVT) const {
3096 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3097
3098 // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3099 // relocation targets a movq or addq instruction: don't let the load shrink.
3100 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3101 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3102 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3103 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3104
3105 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3106 // those uses are extracted directly into a store, then the extract + store
3107 // can be store-folded. Therefore, it's probably not worth splitting the load.
3108 EVT VT = Load->getValueType(0);
3109 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3110 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3111 // Skip uses of the chain value. Result 0 of the node is the load value.
3112 if (UI.getUse().getResNo() != 0)
3113 continue;
3114
3115 // If this use is not an extract + store, it's probably worth splitting.
3116 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3117 UI->use_begin()->getOpcode() != ISD::STORE)
3118 return true;
3119 }
3120 // All non-chain uses are extract + store.
3121 return false;
3122 }
3123
3124 return true;
3125}
3126
3127/// Returns true if it is beneficial to convert a load of a constant
3128/// to just the constant itself.
3130 Type *Ty) const {
3131 assert(Ty->isIntegerTy());
3132
3133 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3134 if (BitSize == 0 || BitSize > 64)
3135 return false;
3136 return true;
3137}
3138
3140 // If we are using XMM registers in the ABI and the condition of the select is
3141 // a floating-point compare and we have blendv or conditional move, then it is
3142 // cheaper to select instead of doing a cross-register move and creating a
3143 // load that depends on the compare result.
3144 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3145 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3146}
3147
3149 // TODO: It might be a win to ease or lift this restriction, but the generic
3150 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3151 if (VT.isVector() && Subtarget.hasAVX512())
3152 return false;
3153
3154 return true;
3155}
3156
3158 SDValue C) const {
3159 // TODO: We handle scalars using custom code, but generic combining could make
3160 // that unnecessary.
3161 APInt MulC;
3162 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3163 return false;
3164
3165 // Find the type this will be legalized to. Otherwise we might prematurely
3166 // convert this to shl+add/sub and then still have to type legalize those ops.
3167 // Another choice would be to defer the decision for illegal types until
3168 // after type legalization. But constant splat vectors of i64 can't make it
3169 // through type legalization on 32-bit targets so we would need to special
3170 // case vXi64.
3171 while (getTypeAction(Context, VT) != TypeLegal)
3172 VT = getTypeToTransformTo(Context, VT);
3173
3174 // If vector multiply is legal, assume that's faster than shl + add/sub.
3175 // Multiply is a complex op with higher latency and lower throughput in
3176 // most implementations; sub-vXi32 vector multiplies are always fast,
3177 // vXi32 must not have a slow PMULLD implementation, and anything larger
3178 // (vXi64) is always going to be slow.
3179 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3180 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3181 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3182 return false;
3183
3184 // shl+add, shl+sub, shl+add+neg
3185 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3186 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3187}
3188
3190 unsigned Index) const {
3192 return false;
3193
3194 // Mask vectors support all subregister combinations and operations that
3195 // extract half of a vector.
3196 if (ResVT.getVectorElementType() == MVT::i1)
3197 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3198 (Index == ResVT.getVectorNumElements()));
3199
3200 return (Index % ResVT.getVectorNumElements()) == 0;
3201}
3202
3204 unsigned Opc = VecOp.getOpcode();
3205
3206 // Assume target opcodes can't be scalarized.
3207 // TODO - do we have any exceptions?
3208 if (Opc >= ISD::BUILTIN_OP_END)
3209 return false;
3210
3211 // If the vector op is not supported, try to convert to scalar.
3212 EVT VecVT = VecOp.getValueType();
3213 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3214 return true;
3215
3216 // If the vector op is supported, but the scalar op is not, the transform may
3217 // not be worthwhile.
3218 EVT ScalarVT = VecVT.getScalarType();
3219 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3220}
3221
3223 bool) const {
3224 // TODO: Allow vectors?
3225 if (VT.isVector())
3226 return false;
3227 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3228}
3229
3231 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3232 return Subtarget.hasBMI() ||
3233 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3234}
3235
3237 // Speculate ctlz only if we can directly use LZCNT.
3238 return Subtarget.hasLZCNT();
3239}
3240
3242 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3243 // expensive than a straight movsd. On the other hand, it's important to
3244 // shrink long double fp constant since fldt is very slow.
3245 return !Subtarget.hasSSE2() || VT == MVT::f80;
3246}
3247
3249 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3250 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3251}
3252
3254 const SelectionDAG &DAG,
3255 const MachineMemOperand &MMO) const {
3256 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3257 BitcastVT.getVectorElementType() == MVT::i1)
3258 return false;
3259
3260 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3261 return false;
3262
3263 // If both types are legal vectors, it's always ok to convert them.
3264 if (LoadVT.isVector() && BitcastVT.isVector() &&
3265 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3266 return true;
3267
3268 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3269}
3270
3272 const MachineFunction &MF) const {
3273 // Do not merge to a float value size (128 bits) if the NoImplicitFloat
3274 // attribute is set.
3275 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3276
3277 if (NoFloat) {
3278 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3279 return (MemVT.getSizeInBits() <= MaxIntSize);
3280 }
3281 // Make sure we don't merge greater than our preferred vector
3282 // width.
3283 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3284 return false;
3285
3286 return true;
3287}
3288
3290 return Subtarget.hasFastLZCNT();
3291}
3292
3294 const Instruction &AndI) const {
3295 return true;
3296}
3297
3299 EVT VT = Y.getValueType();
3300
3301 if (VT.isVector())
3302 return false;
3303
3304 if (!Subtarget.hasBMI())
3305 return false;
3306
3307 // There are only 32-bit and 64-bit forms for 'andn'.
3308 if (VT != MVT::i32 && VT != MVT::i64)
3309 return false;
3310
3311 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3312}
3313
3315 EVT VT = Y.getValueType();
3316
3317 if (!VT.isVector())
3318 return hasAndNotCompare(Y);
3319
3320 // Vector.
3321
3322 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3323 return false;
3324
3325 if (VT == MVT::v4i32)
3326 return true;
3327
3328 return Subtarget.hasSSE2();
3329}
3330
3332 return X.getValueType().isScalarInteger(); // 'bt'
3333}
3334
3338 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3339 SelectionDAG &DAG) const {
3340 // Does the baseline recommend not performing the fold by default?
3342 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3343 return false;
3344 // For scalars this transform is always beneficial.
3345 if (X.getValueType().isScalarInteger())
3346 return true;
3347 // If all the shift amounts are identical, then transform is beneficial even
3348 // with rudimentary SSE2 shifts.
3349 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3350 return true;
3351 // If we have AVX2 with its powerful shift operations, then it's also good.
3352 if (Subtarget.hasAVX2())
3353 return true;
3354 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3355 return NewShiftOpcode == ISD::SHL;
3356}
3357
3359 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3360 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3361 if (!VT.isInteger())
3362 return ShiftOpc;
3363
3364 bool PreferRotate = false;
3365 if (VT.isVector()) {
3366 // For vectors, if we have rotate instruction support, then it's definitely
3367 // best. Otherwise it's not clear what's best, so just don't make changes.
3368 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3369 VT.getScalarType() == MVT::i64);
3370 } else {
3371 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3372 // rotate unless we have a zext mask+shr.
3373 PreferRotate = Subtarget.hasBMI2();
3374 if (!PreferRotate) {
3375 unsigned MaskBits =
3376 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3377 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3378 }
3379 }
3380
3381 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3382 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3383
3384 if (PreferRotate && MayTransformRotate)
3385 return ISD::ROTL;
3386
3387 // For vectors we don't really get much benefit from swapping around constants.
3388 // Maybe we could check if the DAG has the flipped node already in the
3389 // future.
3390 if (VT.isVector())
3391 return ShiftOpc;
3392
3393 // See if it's beneficial to swap the shift type.
3394 if (ShiftOpc == ISD::SHL) {
3395 // If the current setup has an imm64 mask, then the inverse will have
3396 // at least an imm32 mask (or be a zext i32 -> i64).
3397 if (VT == MVT::i64)
3398 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3399 : ShiftOpc;
3400
3401 // We only benefit if the mask requires at least 7 bits. We
3402 // don't want to replace shl of 1, 2 or 3, as they can be implemented
3403 // with lea/add.
3404 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3405 }
3406
3407 if (VT == MVT::i64)
3408 // Keep the mask if it is exactly a 32-bit imm64; this is a zext i32 -> i64,
3409 // which is extremely efficient.
3410 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3411
3412 // Keep small shifts as shl so we can generate add/lea.
3413 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3414 }
3415
3416 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3417 // (PreferRotate will be set in the latter case).
3418 if (PreferRotate || VT.isVector())
3419 return ShiftOpc;
3420
3421 // Non-vector type and we have a zext mask with SRL.
3422 return ISD::SRL;
3423}
3424
3427 const Value *Lhs,
3428 const Value *Rhs) const {
3429 using namespace llvm::PatternMatch;
3430 int BaseCost = BrMergingBaseCostThresh.getValue();
3431 // With CCMP, branches can be merged in a more efficient way.
3432 if (BaseCost >= 0 && Subtarget.hasCCMP())
3433 BaseCost += BrMergingCcmpBias;
3434 // a == b && a == c is a fast pattern on x86.
3436 if (BaseCost >= 0 && Opc == Instruction::And &&
3437 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3438 Pred == ICmpInst::ICMP_EQ &&
3439 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3440 Pred == ICmpInst::ICMP_EQ)
3441 BaseCost += 1;
3442 return {BaseCost, BrMergingLikelyBias.getValue(),
3443 BrMergingUnlikelyBias.getValue()};
3444}
3445
3447 return N->getOpcode() != ISD::FP_EXTEND;
3448}
3449
3451 const SDNode *N, CombineLevel Level) const {
3452 assert(((N->getOpcode() == ISD::SHL &&
3453 N->getOperand(0).getOpcode() == ISD::SRL) ||
3454 (N->getOpcode() == ISD::SRL &&
3455 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3456 "Expected shift-shift mask");
3457 // TODO: Should we always create i64 masks? Or only folded immediates?
3458 EVT VT = N->getValueType(0);
3459 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3460 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3461 // Only fold if the shift values are equal - so it folds to AND.
3462 // TODO - we should fold if either is a non-uniform vector but we don't do
3463 // the fold for non-splats yet.
3464 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3465 }
3467}
3468
3470 EVT VT = Y.getValueType();
3471
3472 // For vectors, we don't have a preference, but we probably want a mask.
3473 if (VT.isVector())
3474 return false;
3475
3476 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3477 if (VT == MVT::i64 && !Subtarget.is64Bit())
3478 return false;
3479
3480 return true;
3481}
3482
3485 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3487 !Subtarget.isOSWindows())
3490 ExpansionFactor);
3491}
3492
3494 // Any legal vector type can be splatted more efficiently than
3495 // loading/spilling from memory.
3496 return isTypeLegal(VT);
3497}
3498
3500 MVT VT = MVT::getIntegerVT(NumBits);
3501 if (isTypeLegal(VT))
3502 return VT;
3503
3504 // PMOVMSKB can handle this.
3505 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3506 return MVT::v16i8;
3507
3508 // VPMOVMSKB can handle this.
3509 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3510 return MVT::v32i8;
3511
3512 // TODO: Allow 64-bit type for 32-bit target.
3513 // TODO: 512-bit types should be allowed, but make sure that those
3514 // cases are handled in combineVectorSizedSetCCEquality().
3515
3517}
3518
3519/// Val is the undef sentinel value or equal to the specified value.
3520static bool isUndefOrEqual(int Val, int CmpVal) {
3521 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3522}
3523
3524/// Return true if every element in Mask is the undef sentinel value or equal to
3525/// the specified value.
3526static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3527 return llvm::all_of(Mask, [CmpVal](int M) {
3528 return (M == SM_SentinelUndef) || (M == CmpVal);
3529 });
3530}
3531
3532/// Return true if every element in Mask, beginning from position Pos and ending
3533/// in Pos+Size is the undef sentinel value or equal to the specified value.
3534static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3535 unsigned Size) {
3536 return llvm::all_of(Mask.slice(Pos, Size),
3537 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3538}
3539
3540/// Val is either the undef or zero sentinel value.
3541static bool isUndefOrZero(int Val) {
3542 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3543}
3544
3545/// Return true if every element in Mask, beginning from position Pos and ending
3546/// in Pos+Size is the undef sentinel value.
3547static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3548 return llvm::all_of(Mask.slice(Pos, Size),
3549 [](int M) { return M == SM_SentinelUndef; });
3550}
3551
3552/// Return true if the mask creates a vector whose lower half is undefined.
3554 unsigned NumElts = Mask.size();
3555 return isUndefInRange(Mask, 0, NumElts / 2);
3556}
3557
3558/// Return true if the mask creates a vector whose upper half is undefined.
3560 unsigned NumElts = Mask.size();
3561 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3562}
3563
3564 /// Return true if Val falls within the specified range [Low, Hi).
3565static bool isInRange(int Val, int Low, int Hi) {
3566 return (Val >= Low && Val < Hi);
3567}
3568
3569/// Return true if the value of any element in Mask falls within the specified
3570 /// range [Low, Hi).
3571static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3572 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3573}
3574
3575/// Return true if the value of any element in Mask is the zero sentinel value.
3576static bool isAnyZero(ArrayRef<int> Mask) {
3577 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3578}
3579
3580/// Return true if Val is undef or if its value falls within the
3581 /// specified range [Low, Hi).
3582static bool isUndefOrInRange(int Val, int Low, int Hi) {
3583 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3584}
3585
3586/// Return true if every element in Mask is undef or if its value
3587 /// falls within the specified range [Low, Hi).
3588static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3589 return llvm::all_of(
3590 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3591}
3592
3593/// Return true if Val is undef, zero or if its value falls within the
3594 /// specified range [Low, Hi).
3595static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3596 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3597}
3598
3599/// Return true if every element in Mask is undef, zero or if its value
3600 /// falls within the specified range [Low, Hi).
3601static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3602 return llvm::all_of(
3603 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3604}
3605
3606 /// Return true if every element in Mask is an in-place blend/select mask or is
3607/// undef.
3609 unsigned NumElts = Mask.size();
3610 for (auto [I, M] : enumerate(Mask))
3611 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3612 return false;
3613 return true;
3614}
3615
3616/// Return true if every element in Mask, beginning
3617/// from position Pos and ending in Pos + Size, falls within the specified
3618/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3619static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3620 unsigned Size, int Low, int Step = 1) {
3621 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3622 if (!isUndefOrEqual(Mask[i], Low))
3623 return false;
3624 return true;
3625}
3626
3627/// Return true if every element in Mask, beginning
3628/// from position Pos and ending in Pos+Size, falls within the specified
3629 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
3631 unsigned Size, int Low,
3632 int Step = 1) {
3633 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3634 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3635 return false;
3636 return true;
3637}
3638
3639/// Return true if every element in Mask, beginning
3640/// from position Pos and ending in Pos+Size is undef or is zero.
3641static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3642 unsigned Size) {
3643 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3644}
3645
3646/// Return true if every element of a single input is referenced by the shuffle
3647/// mask. i.e. it just permutes them all.
3649 unsigned NumElts = Mask.size();
3650 APInt DemandedElts = APInt::getZero(NumElts);
3651 for (int M : Mask)
3652 if (isInRange(M, 0, NumElts))
3653 DemandedElts.setBit(M);
3654 return DemandedElts.isAllOnes();
3655}
3656
3657/// Helper function to test whether a shuffle mask could be
3658/// simplified by widening the elements being shuffled.
3659///
3660/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3661/// leaves it in an unspecified state.
3662///
3663/// NOTE: This must handle normal vector shuffle masks and *target* vector
3664/// shuffle masks. The latter have the special property of a '-2' representing
3665/// a zero-ed lane of a vector.
3667 SmallVectorImpl<int> &WidenedMask) {
3668 WidenedMask.assign(Mask.size() / 2, 0);
3669 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3670 int M0 = Mask[i];
3671 int M1 = Mask[i + 1];
3672
3673 // If both elements are undef, it's trivial.
3674 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3675 WidenedMask[i / 2] = SM_SentinelUndef;
3676 continue;
3677 }
3678
3679 // Check for an undef mask and a mask value properly aligned to fit with
3680 // a pair of values. If we find such a case, use the non-undef mask's value.
3681 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3682 WidenedMask[i / 2] = M1 / 2;
3683 continue;
3684 }
3685 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3686 WidenedMask[i / 2] = M0 / 2;
3687 continue;
3688 }
3689
3690 // When zeroing, we need to spread the zeroing across both lanes to widen.
3691 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3692 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3694 WidenedMask[i / 2] = SM_SentinelZero;
3695 continue;
3696 }
3697 return false;
3698 }
3699
3700 // Finally check if the two mask values are adjacent and aligned with
3701 // a pair.
3702 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3703 WidenedMask[i / 2] = M0 / 2;
3704 continue;
3705 }
3706
3707 // Otherwise we can't safely widen the elements used in this shuffle.
3708 return false;
3709 }
3710 assert(WidenedMask.size() == Mask.size() / 2 &&
3711 "Incorrect size of mask after widening the elements!");
3712
3713 return true;
3714}
3715
3717 const APInt &Zeroable,
3718 bool V2IsZero,
3719 SmallVectorImpl<int> &WidenedMask) {
3720 // Create an alternative mask with info about zeroable elements.
3721 // Here we do not set undef elements as zeroable.
3722 SmallVector<int, 64> ZeroableMask(Mask);
3723 if (V2IsZero) {
3724 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3725 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3726 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3727 ZeroableMask[i] = SM_SentinelZero;
3728 }
3729 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3730}
3731
3733 SmallVector<int, 32> WidenedMask;
3734 return canWidenShuffleElements(Mask, WidenedMask);
3735}
3736
3737// Attempt to narrow/widen shuffle mask until it matches the target number of
3738// elements.
3739static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3740 SmallVectorImpl<int> &ScaledMask) {
3741 unsigned NumSrcElts = Mask.size();
3742 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3743 "Illegal shuffle scale factor");
3744
3745 // Narrowing is guaranteed to work.
3746 if (NumDstElts >= NumSrcElts) {
3747 int Scale = NumDstElts / NumSrcElts;
3748 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3749 return true;
3750 }
3751
3752 // We have to repeat the widening until we reach the target size, but we can
3753 // split out the first widening as it sets up ScaledMask for us.
3754 if (canWidenShuffleElements(Mask, ScaledMask)) {
3755 while (ScaledMask.size() > NumDstElts) {
3756 SmallVector<int, 16> WidenedMask;
3757 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3758 return false;
3759 ScaledMask = std::move(WidenedMask);
3760 }
3761 return true;
3762 }
3763
3764 return false;
3765}
3766
3767static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3768 SmallVector<int, 32> ScaledMask;
3769 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3770}
3771
3772/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3774 return isNullConstant(Elt) || isNullFPConstant(Elt);
3775}
3776
3777// Build a vector of constants.
3778// Use an UNDEF node if MaskElt == -1.
3779// Split 64-bit constants in the 32-bit mode.
3781 const SDLoc &dl, bool IsMask = false) {
3782
3784 bool Split = false;
3785
3786 MVT ConstVecVT = VT;
3787 unsigned NumElts = VT.getVectorNumElements();
3788 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3789 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3790 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3791 Split = true;
3792 }
3793
3794 MVT EltVT = ConstVecVT.getVectorElementType();
3795 for (unsigned i = 0; i < NumElts; ++i) {
3796 bool IsUndef = Values[i] < 0 && IsMask;
3797 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3798 DAG.getConstant(Values[i], dl, EltVT);
3799 Ops.push_back(OpNode);
3800 if (Split)
3801 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3802 DAG.getConstant(0, dl, EltVT));
3803 }
3804 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3805 if (Split)
3806 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3807 return ConstsNode;
3808}
3809
3810static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3811 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3812 assert(Bits.size() == Undefs.getBitWidth() &&
3813 "Unequal constant and undef arrays");
3815 bool Split = false;
3816
3817 MVT ConstVecVT = VT;
3818 unsigned NumElts = VT.getVectorNumElements();
3819 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3820 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3821 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3822 Split = true;
3823 }
3824
3825 MVT EltVT = ConstVecVT.getVectorElementType();
3826 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3827 if (Undefs[i]) {
3828 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3829 continue;
3830 }
3831 const APInt &V = Bits[i];
3832 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3833 if (Split) {
3834 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3835 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3836 } else if (EltVT == MVT::f32) {
3838 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3839 } else if (EltVT == MVT::f64) {
3841 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3842 } else {
3843 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3844 }
3845 }
3846
3847 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3848 return DAG.getBitcast(VT, ConstsNode);
3849}
3850
3852 SelectionDAG &DAG, const SDLoc &dl) {
3853 APInt Undefs = APInt::getZero(Bits.size());
3854 return getConstVector(Bits, Undefs, VT, DAG, dl);
3855}
3856
3857/// Returns a vector of specified type with all zero elements.
3858static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3859 SelectionDAG &DAG, const SDLoc &dl) {
3860 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3861 VT.getVectorElementType() == MVT::i1) &&
3862 "Unexpected vector type");
3863
3864 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3865 // type. This ensures they get CSE'd. But if the integer type is not
3866 // available, use a floating-point +0.0 instead.
3867 SDValue Vec;
3868 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3869 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3870 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3871 } else if (VT.isFloatingPoint() &&
3873 Vec = DAG.getConstantFP(+0.0, dl, VT);
3874 } else if (VT.getVectorElementType() == MVT::i1) {
3875 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3876 "Unexpected vector type");
3877 Vec = DAG.getConstant(0, dl, VT);
3878 } else {
3879 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3880 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3881 }
3882 return DAG.getBitcast(VT, Vec);
3883}
3884
3885 // Helper to determine if the ops are all extracted subvectors that come from a
3886 // single source. If we allow commuting, they don't have to be in order (Lo/Hi).
3887static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3888 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3889 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3890 LHS.getValueType() != RHS.getValueType() ||
3891 LHS.getOperand(0) != RHS.getOperand(0))
3892 return SDValue();
3893
3894 SDValue Src = LHS.getOperand(0);
3895 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3896 return SDValue();
3897
3898 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3899 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3900 RHS.getConstantOperandAPInt(1) == NumElts) ||
3901 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3902 LHS.getConstantOperandAPInt(1) == NumElts))
3903 return Src;
3904
3905 return SDValue();
3906}
3907
3908static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3909 const SDLoc &dl, unsigned vectorWidth) {
3910 EVT VT = Vec.getValueType();
3911 EVT ElVT = VT.getVectorElementType();
3912 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3913 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3914 VT.getVectorNumElements() / Factor);
3915
3916 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3917 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3918 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3919
3920 // This is the index of the first element of the vectorWidth-bit chunk
3921 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
3922 IdxVal &= ~(ElemsPerChunk - 1);
3923
3924 // If the input is a buildvector just emit a smaller one.
3925 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3926 return DAG.getBuildVector(ResultVT, dl,
3927 Vec->ops().slice(IdxVal, ElemsPerChunk));
3928
3929 // Check if we're extracting the upper undef of a widening pattern.
3930 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3931 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3932 isNullConstant(Vec.getOperand(2)))
3933 return DAG.getUNDEF(ResultVT);
3934
3935 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3936 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3937}
3938
3939/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3940/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3941/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3942/// instructions or a simple subregister reference. Idx is an index in the
3943/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3944/// lowering EXTRACT_VECTOR_ELT operations easier.
3945static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3946 SelectionDAG &DAG, const SDLoc &dl) {
3948 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3949 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3950}
3951
3952/// Generate a DAG to grab 256-bits from a 512-bit vector.
3953static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3954 SelectionDAG &DAG, const SDLoc &dl) {
3955 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3956 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3957}
3958
3959static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3960 SelectionDAG &DAG, const SDLoc &dl,
3961 unsigned vectorWidth) {
3962 assert((vectorWidth == 128 || vectorWidth == 256) &&
3963 "Unsupported vector width");
3964 // Inserting UNDEF just returns Result unchanged.
3965 if (Vec.isUndef())
3966 return Result;
3967 EVT VT = Vec.getValueType();
3968 EVT ElVT = VT.getVectorElementType();
3969 EVT ResultVT = Result.getValueType();
3970
3971 // Insert the relevant vectorWidth bits.
3972 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3973 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3974
3975 // This is the index of the first element of the vectorWidth-bit chunk
3977 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
3977 IdxVal &= ~(ElemsPerChunk - 1);
3978
3979 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3980 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3981}
3982
3983/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3984/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3985/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3986/// simple superregister reference. Idx is an index in the 128 bits
3987/// we want. It need not be aligned to a 128-bit boundary. That makes
3988/// lowering INSERT_VECTOR_ELT operations easier.
3989static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3990 SelectionDAG &DAG, const SDLoc &dl) {
3991 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3992 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3993}
3994
3995/// Widen a vector to a larger size with the same scalar type, with the new
3996/// elements either zero or undef.
3997static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3998 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3999 const SDLoc &dl) {
4001 Vec.getValueType().getScalarType() == VT.getScalarType() &&
4002 "Unsupported vector widening type");
4003 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4004 : DAG.getUNDEF(VT);
4005 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4006 DAG.getIntPtrConstant(0, dl));
4007}
4008
4009/// Widen a vector to a larger size with the same scalar type, with the new
4010/// elements either zero or undef.
4011static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4012 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4013 const SDLoc &dl, unsigned WideSizeInBits) {
4014 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4015 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4016 "Unsupported vector widening type");
4017 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4018 MVT SVT = Vec.getSimpleValueType().getScalarType();
4019 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4020 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4021}
4022
4023/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4024/// and bitcast with integer types.
4025static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4026 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4027 unsigned NumElts = VT.getVectorNumElements();
4028 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4029 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4030 return VT;
4031}
4032
4033/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4034/// bitcast with integer types.
4035static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4036 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4037 const SDLoc &dl) {
4038 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4039 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4040}
4041
4042// Helper function to collect subvector ops that are concatenated together,
4043 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4044// The subvectors in Ops are guaranteed to be the same type.
4046 SelectionDAG &DAG) {
4047 assert(Ops.empty() && "Expected an empty ops vector");
4048
4049 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4050 Ops.append(N->op_begin(), N->op_end());
4051 return true;
4052 }
4053
4054 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4055 SDValue Src = N->getOperand(0);
4056 SDValue Sub = N->getOperand(1);
4057 const APInt &Idx = N->getConstantOperandAPInt(2);
4058 EVT VT = Src.getValueType();
4059 EVT SubVT = Sub.getValueType();
4060
4061 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4062 // insert_subvector(undef, x, lo)
4063 if (Idx == 0 && Src.isUndef()) {
4064 Ops.push_back(Sub);
4065 Ops.push_back(DAG.getUNDEF(SubVT));
4066 return true;
4067 }
4068 if (Idx == (VT.getVectorNumElements() / 2)) {
4069 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4070 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4071 Src.getOperand(1).getValueType() == SubVT &&
4072 isNullConstant(Src.getOperand(2))) {
4073 // Attempt to recurse into inner (matching) concats.
4074 SDValue Lo = Src.getOperand(1);
4075 SDValue Hi = Sub;
4076 SmallVector<SDValue, 2> LoOps, HiOps;
4077 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4078 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4079 LoOps.size() == HiOps.size()) {
4080 Ops.append(LoOps);
4081 Ops.append(HiOps);
4082 return true;
4083 }
4084 Ops.push_back(Lo);
4085 Ops.push_back(Hi);
4086 return true;
4087 }
4088 // insert_subvector(x, extract_subvector(x, lo), hi)
4089 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4090 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4091 Ops.append(2, Sub);
4092 return true;
4093 }
4094 // insert_subvector(undef, x, hi)
4095 if (Src.isUndef()) {
4096 Ops.push_back(DAG.getUNDEF(SubVT));
4097 Ops.push_back(Sub);
4098 return true;
4099 }
4100 }
4101 }
4102 }
4103
4104 return false;
4105}
4106
4107 // Helper to check if \p V can be split into subvectors where the upper
4108 // subvectors are all undef, in which case return the lower half.
4110 SelectionDAG &DAG) {
4111 SmallVector<SDValue> SubOps;
4112 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4113 return SDValue();
4114
4115 unsigned NumSubOps = SubOps.size();
4116 unsigned HalfNumSubOps = NumSubOps / 2;
4117 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4118
4119 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4120 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4121 return SDValue();
4122
4123 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4124 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4125 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4126}
4127
4128// Helper to check if we can access all the constituent subvectors without any
4129// extract ops.
4132 return collectConcatOps(N, Ops, DAG);
4133}
4134
4135static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4136 const SDLoc &dl) {
4137 EVT VT = Op.getValueType();
4138 unsigned NumElems = VT.getVectorNumElements();
4139 unsigned SizeInBits = VT.getSizeInBits();
4140 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4141 "Can't split odd sized vector");
4142
4143 // If this is a splat value (with no-undefs) then use the lower subvector,
4144 // which should be a free extraction.
4145 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4146 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4147 return std::make_pair(Lo, Lo);
4148
4149 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4150 return std::make_pair(Lo, Hi);
4151}
4152
4153/// Break an operation into 2 half sized ops and then concatenate the results.
4155 unsigned NumOps = Op.getNumOperands();
4156 EVT VT = Op.getValueType();
4157
4158 // Extract the LHS Lo/Hi vectors
4159 SmallVector<SDValue> LoOps(NumOps, SDValue());
4160 SmallVector<SDValue> HiOps(NumOps, SDValue());
4161 for (unsigned I = 0; I != NumOps; ++I) {
4162 SDValue SrcOp = Op.getOperand(I);
4163 if (!SrcOp.getValueType().isVector()) {
4164 LoOps[I] = HiOps[I] = SrcOp;
4165 continue;
4166 }
4167 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4168 }
4169
4170 EVT LoVT, HiVT;
4171 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4172 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4173 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4174 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4175}
4176
4177 /// Break a unary integer operation into 2 half-sized ops and then
4178/// concatenate the result back.
4180 const SDLoc &dl) {
4181 // Make sure we only try to split 256/512-bit types to avoid creating
4182 // narrow vectors.
4183 [[maybe_unused]] EVT VT = Op.getValueType();
4184 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4185 Op.getOperand(0).getValueType().is512BitVector()) &&
4186 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4187 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4188 VT.getVectorNumElements() &&
4189 "Unexpected VTs!");
4190 return splitVectorOp(Op, DAG, dl);
4191}
4192
4193/// Break a binary integer operation into 2 half sized ops and then
4194/// concatenate the result back.
4196 const SDLoc &dl) {
4197 // Assert that all the types match.
4198 [[maybe_unused]] EVT VT = Op.getValueType();
4199 assert(Op.getOperand(0).getValueType() == VT &&
4200 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4201 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4202 return splitVectorOp(Op, DAG, dl);
4203}
4204
4205 // Helper for splitting the operands of an operation to a legal target size
4206 // and applying a function to each part.
4207// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4208// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4209// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4210// The argument Builder is a function that will be applied on each split part:
4211// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4212template <typename F>
4214 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4215 F Builder, bool CheckBWI = true) {
4216 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4217 unsigned NumSubs = 1;
4218 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4219 (!CheckBWI && Subtarget.useAVX512Regs())) {
4220 if (VT.getSizeInBits() > 512) {
4221 NumSubs = VT.getSizeInBits() / 512;
4222 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4223 }
4224 } else if (Subtarget.hasAVX2()) {
4225 if (VT.getSizeInBits() > 256) {
4226 NumSubs = VT.getSizeInBits() / 256;
4227 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4228 }
4229 } else {
4230 if (VT.getSizeInBits() > 128) {
4231 NumSubs = VT.getSizeInBits() / 128;
4232 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4233 }
4234 }
4235
4236 if (NumSubs == 1)
4237 return Builder(DAG, DL, Ops);
4238
4240 for (unsigned i = 0; i != NumSubs; ++i) {
4241 SmallVector<SDValue, 2> SubOps;
4242 for (SDValue Op : Ops) {
4243 EVT OpVT = Op.getValueType();
4244 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4245 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4246 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4247 }
4248 Subs.push_back(Builder(DAG, DL, SubOps));
4249 }
4250 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4251}
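// Illustrative use (not from this file), assuming DAG, Subtarget, DL, VT, Opc,
// LHS and RHS are in scope:
//   SDValue Res = SplitOpsAndApply(
//       DAG, Subtarget, DL, VT, {LHS, RHS},
//       [Opc](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) {
//         return DAG.getNode(Opc, DL, Ops[0].getValueType(), Ops[0], Ops[1]);
//       });
// Each call of the lambda sees one legal-width slice of the original operands.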
4252
4253// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4254// targets.
4255static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4256 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4257 const X86Subtarget &Subtarget) {
4258 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4259 MVT SVT = VT.getScalarType();
4260
4261 // If we have a 32/64 splatted constant, splat it to DstTy to
4262 // encourage a foldable broadcast'd operand.
4263 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4264 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4265 // AVX512 broadcasts 32/64-bit operands.
4266 // TODO: Support float once getAVX512Node is used by fp-ops.
4267 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4268 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4269 return SDValue();
4270 // If we're not widening, don't bother if we're not bitcasting.
4271 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4272 return SDValue();
4273 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4274 APInt SplatValue, SplatUndef;
4275 unsigned SplatBitSize;
4276 bool HasAnyUndefs;
4277 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4278 HasAnyUndefs, OpEltSizeInBits) &&
4279 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4280 return DAG.getConstant(SplatValue, DL, DstVT);
4281 }
4282 return SDValue();
4283 };
4284
4285 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4286
4287 MVT DstVT = VT;
4288 if (Widen)
4289 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4290
4291 // Canonicalize src operands.
4292 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4293 for (SDValue &Op : SrcOps) {
4294 MVT OpVT = Op.getSimpleValueType();
4295 // Just pass through scalar operands.
4296 if (!OpVT.isVector())
4297 continue;
4298 assert(OpVT == VT && "Vector type mismatch");
4299
4300 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4301 Op = BroadcastOp;
4302 continue;
4303 }
4304
4305 // Just widen the subvector by inserting into an undef wide vector.
4306 if (Widen)
4307 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4308 }
4309
4310 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4311
4312 // Perform the 512-bit op then extract the bottom subvector.
4313 if (Widen)
4314 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4315 return Res;
4316}
4317
4318/// Insert i1-subvector to i1-vector.
4319 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4320 const X86Subtarget &Subtarget) {
4321
4322 SDLoc dl(Op);
4323 SDValue Vec = Op.getOperand(0);
4324 SDValue SubVec = Op.getOperand(1);
4325 SDValue Idx = Op.getOperand(2);
4326 unsigned IdxVal = Op.getConstantOperandVal(2);
4327
4328 // Inserting undef is a nop. We can just return the original vector.
4329 if (SubVec.isUndef())
4330 return Vec;
4331
4332 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4333 return Op;
4334
4335 MVT OpVT = Op.getSimpleValueType();
4336 unsigned NumElems = OpVT.getVectorNumElements();
4337 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4338
4339 // Extend to natively supported kshift.
4340 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4341
4342 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4343 // if necessary.
4344 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4345 // May need to promote to a legal type.
4346 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4347 DAG.getConstant(0, dl, WideOpVT),
4348 SubVec, Idx);
4349 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4350 }
4351
4352 MVT SubVecVT = SubVec.getSimpleValueType();
4353 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4354 assert(IdxVal + SubVecNumElems <= NumElems &&
4355 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4356 "Unexpected index value in INSERT_SUBVECTOR");
4357
4358 SDValue Undef = DAG.getUNDEF(WideOpVT);
4359
4360 if (IdxVal == 0) {
4361 // Zero lower bits of the Vec
4362 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4363 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4364 ZeroIdx);
4365 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4366 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4367 // Merge them together, SubVec should be zero extended.
4368 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4369 DAG.getConstant(0, dl, WideOpVT),
4370 SubVec, ZeroIdx);
4371 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4372 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4373 }
4374
4375 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4376 Undef, SubVec, ZeroIdx);
4377
4378 if (Vec.isUndef()) {
4379 assert(IdxVal != 0 && "Unexpected index");
4380 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4381 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4382 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4383 }
4384
4385 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4384
4386 assert(IdxVal != 0 && "Unexpected index");
4387 // If upper elements of Vec are known undef, then just shift into place.
4388 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4389 [](SDValue V) { return V.isUndef(); })) {
4390 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4391 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4392 } else {
4393 NumElems = WideOpVT.getVectorNumElements();
4394 unsigned ShiftLeft = NumElems - SubVecNumElems;
4395 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4396 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4397 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4398 if (ShiftRight != 0)
4399 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4400 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4401 }
4402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4403 }
4404
4405 // Simple case when we put subvector in the upper part
4406 if (IdxVal + SubVecNumElems == NumElems) {
4407 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4408 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4409 if (SubVecNumElems * 2 == NumElems) {
4410 // Special case, use legal zero extending insert_subvector. This allows
4411 // isel to optimize when bits are known zero.
4412 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4413 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4414 DAG.getConstant(0, dl, WideOpVT),
4415 Vec, ZeroIdx);
4416 } else {
4417 // Otherwise use explicit shifts to zero the bits.
4418 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4419 Undef, Vec, ZeroIdx);
4420 NumElems = WideOpVT.getVectorNumElements();
4421 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4422 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4423 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4424 }
4425 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4426 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4427 }
4428
4429 // Inserting into the middle is more complicated.
4430
4431 NumElems = WideOpVT.getVectorNumElements();
4432
4433 // Widen the vector if needed.
4434 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4435
4436 unsigned ShiftLeft = NumElems - SubVecNumElems;
4437 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4438
4439 // Do an optimization for the most frequently used types.
4440 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4441 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4442 Mask0.flipAllBits();
4443 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4444 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4445 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4446 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4447 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4448 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4449 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4450 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4451
4452 // Reduce to original width if needed.
4453 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4454 }
4455
4456 // Clear the upper bits of the subvector and move it to its insert position.
4457 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4458 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4459 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4460 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4461
4462 // Isolate the bits below the insertion point.
4463 unsigned LowShift = NumElems - IdxVal;
4464 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4465 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4466 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4467 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4468
4469 // Isolate the bits after the last inserted bit.
4470 unsigned HighShift = IdxVal + SubVecNumElems;
4471 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4472 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4473 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4474 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4475
4476 // Now OR all 3 pieces together.
4477 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4478 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4479
4480 // Reduce to original width if needed.
4481 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4482}
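// Worked example (illustrative): inserting a v2i1 subvector into v16i1 at
// index 2 (WideOpVT == v16i1) uses ShiftLeft == 14 and ShiftRight == 12, so
// KSHIFTL moves the two mask bits to the top, KSHIFTR drops them into bits
// [2,3] with everything else cleared, and the result is OR'd with the masked
// original vector.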
4483
4484 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4485 const SDLoc &dl) {
4486 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4487 EVT SubVT = V1.getValueType();
4488 EVT SubSVT = SubVT.getScalarType();
4489 unsigned SubNumElts = SubVT.getVectorNumElements();
4490 unsigned SubVectorWidth = SubVT.getSizeInBits();
4491 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4492 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4493 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4494}
4495
4496/// Returns a vector of specified type with all bits set.
4497/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4498/// Then bitcast to their original type, ensuring they get CSE'd.
4499static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4500 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4501 "Expected a 128/256/512-bit vector type");
4502 unsigned NumElts = VT.getSizeInBits() / 32;
4503 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4504 return DAG.getBitcast(VT, Vec);
4505}
4506
4507static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4508 SDValue In, SelectionDAG &DAG) {
4509 EVT InVT = In.getValueType();
4510 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4511 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4512 ISD::ZERO_EXTEND == Opcode) &&
4513 "Unknown extension opcode");
4514
4515 // For 256-bit vectors, we only need the lower (128-bit) input half.
4516 // For 512-bit vectors, we only need the lower input half or quarter.
4517 if (InVT.getSizeInBits() > 128) {
4518 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4519 "Expected VTs to be the same size!");
4520 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4521 In = extractSubVector(In, 0, DAG, DL,
4522 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4523 InVT = In.getValueType();
4524 }
4525
4526 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4527 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4528
4529 return DAG.getNode(Opcode, DL, VT, In);
4530}
4531
4532// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4533static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4534 SDValue Mask, SelectionDAG &DAG) {
4535 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4536 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4537 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4538}
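// i.e. each result bit takes the LHS bit where the mask bit is set and the
// RHS bit where it is clear (ANDNP computes ~Mask & RHS).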
4539
4540 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4541 bool Lo, bool Unary) {
4542 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4543 "Illegal vector type to unpack");
4544 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4545 int NumElts = VT.getVectorNumElements();
4546 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4547 for (int i = 0; i < NumElts; ++i) {
4548 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4549 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4550 Pos += (Unary ? 0 : NumElts * (i % 2));
4551 Pos += (Lo ? 0 : NumEltsInLane / 2);
4552 Mask.push_back(Pos);
4553 }
4554}
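// e.g. for v8i16 this produces <0,8,1,9,2,10,3,11> for Lo/!Unary (the
// PUNPCKLWD pattern) and <0,0,1,1,2,2,3,3> for Lo/Unary.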
4555
4556/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4557/// imposed by AVX and specific to the unary pattern. Example:
4558/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4559/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4560 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4561 bool Lo) {
4562 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4563 int NumElts = VT.getVectorNumElements();
4564 for (int i = 0; i < NumElts; ++i) {
4565 int Pos = i / 2;
4566 Pos += (Lo ? 0 : NumElts / 2);
4567 Mask.push_back(Pos);
4568 }
4569}
4570
4571// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4572static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4573 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4574 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4575 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4576 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4577 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4578 int M = Mask[I];
4579 if (M < 0)
4580 continue;
4581 SDValue V = (M < NumElts) ? V1 : V2;
4582 if (V.isUndef())
4583 continue;
4584 Ops[I] = V.getOperand(M % NumElts);
4585 }
4586 return DAG.getBuildVector(VT, dl, Ops);
4587 }
4588
4589 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4590}
4591
4592/// Returns a vector_shuffle node for an unpackl operation.
4593static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4594 SDValue V1, SDValue V2) {
4595 SmallVector<int, 8> Mask;
4596 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4597 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4598}
4599
4600/// Returns a vector_shuffle node for an unpackh operation.
4601static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4602 SDValue V1, SDValue V2) {
4603 SmallVector<int, 8> Mask;
4604 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4605 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4606}
4607
4608/// Returns a node that packs the LHS + RHS nodes together at half width.
4609/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4610/// TODO: Add subvector splitting if/when we have a need for it.
4611static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4612 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4613 bool PackHiHalf = false) {
4614 MVT OpVT = LHS.getSimpleValueType();
4615 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4616 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4617 assert(OpVT == RHS.getSimpleValueType() &&
4618 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4619 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4620 "Unexpected PACK operand types");
4621 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4622 "Unexpected PACK result type");
4623
4624 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4625 if (EltSizeInBits == 32) {
4626 SmallVector<int> PackMask;
4627 int Offset = PackHiHalf ? 1 : 0;
4628 int NumElts = VT.getVectorNumElements();
4629 for (int I = 0; I != NumElts; I += 4) {
4630 PackMask.push_back(I + Offset);
4631 PackMask.push_back(I + Offset + 2);
4632 PackMask.push_back(I + Offset + NumElts);
4633 PackMask.push_back(I + Offset + NumElts + 2);
4634 }
4635 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4636 DAG.getBitcast(VT, RHS), PackMask);
4637 }
4638
4639 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4640 if (!PackHiHalf) {
4641 if (UsePackUS &&
4642 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4643 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4644 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4645
4646 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4647 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4648 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4649 }
4650
4651 // Fallback to sign/zero extending the requested half and pack.
4652 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4653 if (UsePackUS) {
4654 if (PackHiHalf) {
4655 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4656 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4657 } else {
4658 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4659 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4660 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4661 }
4662 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4663 }
4664
4665 if (!PackHiHalf) {
4666 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4667 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4668 }
4669 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4670 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4671 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4672}
4673
4674 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4675/// This produces a shuffle where the low element of V2 is swizzled into the
4676/// zero/undef vector, landing at element Idx.
4677/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4678 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4679 bool IsZero,
4680 const X86Subtarget &Subtarget,
4681 SelectionDAG &DAG) {
4682 MVT VT = V2.getSimpleValueType();
4683 SDValue V1 = IsZero
4684 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4685 int NumElems = VT.getVectorNumElements();
4686 SmallVector<int, 16> MaskVec(NumElems);
4687 for (int i = 0; i != NumElems; ++i)
4688 // If this is the insertion idx, put the low elt of V2 here.
4689 MaskVec[i] = (i == Idx) ? NumElems : i;
4690 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4691}
4692
4693 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4694 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4695 Ptr.getOpcode() == X86ISD::WrapperRIP)
4696 Ptr = Ptr.getOperand(0);
4697 return dyn_cast<ConstantPoolSDNode>(Ptr);
4698}
4699
4700// TODO: Add support for non-zero offsets.
4701 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4702 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4703 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4704 return nullptr;
4705 return CNode->getConstVal();
4706}
4707
4708 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4709 if (!Load || !ISD::isNormalLoad(Load))
4710 return nullptr;
4711 return getTargetConstantFromBasePtr(Load->getBasePtr());
4712}
4713
4714 static const Constant *getTargetConstantFromNode(SDValue Op) {
4715 Op = peekThroughBitcasts(Op);
4716 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4717}
4718
4719const Constant *
4720 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4721 assert(LD && "Unexpected null LoadSDNode");
4722 return getTargetConstantFromNode(LD);
4723}
4724
4725// Extract raw constant bits from constant pools.
4726static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4727 APInt &UndefElts,
4728 SmallVectorImpl<APInt> &EltBits,
4729 bool AllowWholeUndefs = true,
4730 bool AllowPartialUndefs = false) {
4731 assert(EltBits.empty() && "Expected an empty EltBits vector");
4732
4733 Op = peekThroughBitcasts(Op);
4734
4735 EVT VT = Op.getValueType();
4736 unsigned SizeInBits = VT.getSizeInBits();
4737 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4738 unsigned NumElts = SizeInBits / EltSizeInBits;
4739
4740 // Bitcast a source array of element bits to the target size.
4741 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4742 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4743 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4744 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4745 "Constant bit sizes don't match");
4746
4747 // Don't split if we don't allow undef bits.
4748 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4749 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4750 return false;
4751
4752 // If we're already the right size, don't bother bitcasting.
4753 if (NumSrcElts == NumElts) {
4754 UndefElts = UndefSrcElts;
4755 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4756 return true;
4757 }
4758
4759 // Extract all the undef/constant element data and pack into single bitsets.
4760 APInt UndefBits(SizeInBits, 0);
4761 APInt MaskBits(SizeInBits, 0);
4762
4763 for (unsigned i = 0; i != NumSrcElts; ++i) {
4764 unsigned BitOffset = i * SrcEltSizeInBits;
4765 if (UndefSrcElts[i])
4766 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4767 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4768 }
4769
4770 // Split the undef/constant single bitset data into the target elements.
4771 UndefElts = APInt(NumElts, 0);
4772 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4773
4774 for (unsigned i = 0; i != NumElts; ++i) {
4775 unsigned BitOffset = i * EltSizeInBits;
4776 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4777
4778 // Only treat an element as UNDEF if all bits are UNDEF.
4779 if (UndefEltBits.isAllOnes()) {
4780 if (!AllowWholeUndefs)
4781 return false;
4782 UndefElts.setBit(i);
4783 continue;
4784 }
4785
4786 // If only some bits are UNDEF then treat them as zero (or bail if not
4787 // supported).
4788 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4789 return false;
4790
4791 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4792 }
4793 return true;
4794 };
4795
4796 // Collect constant bits and insert into mask/undef bit masks.
4797 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4798 unsigned UndefBitIndex) {
4799 if (!Cst)
4800 return false;
4801 if (isa<UndefValue>(Cst)) {
4802 Undefs.setBit(UndefBitIndex);
4803 return true;
4804 }
4805 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4806 Mask = CInt->getValue();
4807 return true;
4808 }
4809 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4810 Mask = CFP->getValueAPF().bitcastToAPInt();
4811 return true;
4812 }
4813 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4814 Type *Ty = CDS->getType();
4815 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4816 Type *EltTy = CDS->getElementType();
4817 bool IsInteger = EltTy->isIntegerTy();
4818 bool IsFP =
4819 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4820 if (!IsInteger && !IsFP)
4821 return false;
4822 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4823 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4824 if (IsInteger)
4825 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4826 else
4827 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4828 I * EltBits);
4829 return true;
4830 }
4831 return false;
4832 };
4833
4834 // Handle UNDEFs.
4835 if (Op.isUndef()) {
4836 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4837 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4838 return CastBitData(UndefSrcElts, SrcEltBits);
4839 }
4840
4841 // Extract scalar constant bits.
4842 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4843 APInt UndefSrcElts = APInt::getZero(1);
4844 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4845 return CastBitData(UndefSrcElts, SrcEltBits);
4846 }
4847 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4848 APInt UndefSrcElts = APInt::getZero(1);
4849 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4850 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4851 return CastBitData(UndefSrcElts, SrcEltBits);
4852 }
4853
4854 // Extract constant bits from build vector.
4855 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4856 BitVector Undefs;
4857 SmallVector<APInt> SrcEltBits;
4858 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4859 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4860 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4861 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4862 if (Undefs[I])
4863 UndefSrcElts.setBit(I);
4864 return CastBitData(UndefSrcElts, SrcEltBits);
4865 }
4866 }
4867
4868 // Extract constant bits from constant pool vector.
4869 if (auto *Cst = getTargetConstantFromNode(Op)) {
4870 Type *CstTy = Cst->getType();
4871 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4872 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4873 return false;
4874
4875 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4876 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4877 if ((SizeInBits % SrcEltSizeInBits) != 0)
4878 return false;
4879
4880 APInt UndefSrcElts(NumSrcElts, 0);
4881 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4882 for (unsigned i = 0; i != NumSrcElts; ++i)
4883 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4884 UndefSrcElts, i))
4885 return false;
4886
4887 return CastBitData(UndefSrcElts, SrcEltBits);
4888 }
4889
4890 // Extract constant bits from a broadcasted constant pool scalar.
4891 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4892 EltSizeInBits <= VT.getScalarSizeInBits()) {
4893 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4894 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4895 return false;
4896
4897 SDValue Ptr = MemIntr->getBasePtr();
4898 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4899 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4900 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4901
4902 APInt UndefSrcElts(NumSrcElts, 0);
4903 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4904 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4905 if (UndefSrcElts[0])
4906 UndefSrcElts.setBits(0, NumSrcElts);
4907 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4908 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4909 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4910 return CastBitData(UndefSrcElts, SrcEltBits);
4911 }
4912 }
4913 }
4914
4915 // Extract constant bits from a subvector broadcast.
4916 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4917 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4918 SDValue Ptr = MemIntr->getBasePtr();
4919 // The source constant may be larger than the subvector broadcast, so
4920 // ensure we extract the correct subvector constants.
4921 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4922 Type *CstTy = Cst->getType();
4923 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4924 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4925 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4926 (SizeInBits % SubVecSizeInBits) != 0)
4927 return false;
4928 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4929 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4930 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4931 APInt UndefSubElts(NumSubElts, 0);
4932 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4933 APInt(CstEltSizeInBits, 0));
4934 for (unsigned i = 0; i != NumSubElts; ++i) {
4935 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4936 UndefSubElts, i))
4937 return false;
4938 for (unsigned j = 1; j != NumSubVecs; ++j)
4939 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4940 }
4941 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4942 UndefSubElts);
4943 return CastBitData(UndefSubElts, SubEltBits);
4944 }
4945 }
4946
4947 // Extract a rematerialized scalar constant insertion.
4948 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4949 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4950 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4951 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4952 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4953
4954 APInt UndefSrcElts(NumSrcElts, 0);
4955 SmallVector<APInt, 64> SrcEltBits;
4956 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4957 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4958 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4959 return CastBitData(UndefSrcElts, SrcEltBits);
4960 }
4961
4962 // Insert constant bits from base and subvector sources.
4963 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4964 // If bitcasting to larger elements we might lose track of undefs - don't
4965 // allow any, to be safe.
4966 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4967 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4968
4969 APInt UndefSrcElts, UndefSubElts;
4970 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4971 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4972 UndefSubElts, EltSubBits,
4973 AllowWholeUndefs && AllowUndefs,
4974 AllowPartialUndefs && AllowUndefs) &&
4975 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4976 UndefSrcElts, EltSrcBits,
4977 AllowWholeUndefs && AllowUndefs,
4978 AllowPartialUndefs && AllowUndefs)) {
4979 unsigned BaseIdx = Op.getConstantOperandVal(2);
4980 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4981 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4982 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4983 return CastBitData(UndefSrcElts, EltSrcBits);
4984 }
4985 }
4986
4987 // Extract constant bits from a subvector's source.
4988 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4989 // TODO - support extract_subvector through bitcasts.
4990 if (EltSizeInBits != VT.getScalarSizeInBits())
4991 return false;
4992
4993 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4994 UndefElts, EltBits, AllowWholeUndefs,
4995 AllowPartialUndefs)) {
4996 EVT SrcVT = Op.getOperand(0).getValueType();
4997 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4998 unsigned NumSubElts = VT.getVectorNumElements();
4999 unsigned BaseIdx = Op.getConstantOperandVal(1);
5000 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5001 if ((BaseIdx + NumSubElts) != NumSrcElts)
5002 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5003 if (BaseIdx != 0)
5004 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5005 return true;
5006 }
5007 }
5008
5009 // Extract constant bits from shuffle node sources.
5010 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5011 // TODO - support shuffle through bitcasts.
5012 if (EltSizeInBits != VT.getScalarSizeInBits())
5013 return false;
5014
5015 ArrayRef<int> Mask = SVN->getMask();
5016 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5017 llvm::any_of(Mask, [](int M) { return M < 0; }))
5018 return false;
5019
5020 APInt UndefElts0, UndefElts1;
5021 SmallVector<APInt, 32> EltBits0, EltBits1;
5022 if (isAnyInRange(Mask, 0, NumElts) &&
5023 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5024 UndefElts0, EltBits0, AllowWholeUndefs,
5025 AllowPartialUndefs))
5026 return false;
5027 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5028 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5029 UndefElts1, EltBits1, AllowWholeUndefs,
5030 AllowPartialUndefs))
5031 return false;
5032
5033 UndefElts = APInt::getZero(NumElts);
5034 for (int i = 0; i != (int)NumElts; ++i) {
5035 int M = Mask[i];
5036 if (M < 0) {
5037 UndefElts.setBit(i);
5038 EltBits.push_back(APInt::getZero(EltSizeInBits));
5039 } else if (M < (int)NumElts) {
5040 if (UndefElts0[M])
5041 UndefElts.setBit(i);
5042 EltBits.push_back(EltBits0[M]);
5043 } else {
5044 if (UndefElts1[M - NumElts])
5045 UndefElts.setBit(i);
5046 EltBits.push_back(EltBits1[M - NumElts]);
5047 }
5048 }
5049 return true;
5050 }
5051
5052 return false;
5053}
5054
5055namespace llvm {
5056namespace X86 {
5057bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5058 APInt UndefElts;
5059 SmallVector<APInt, 16> EltBits;
5060 if (getTargetConstantBitsFromNode(
5061 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5062 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5063 int SplatIndex = -1;
5064 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5065 if (UndefElts[i])
5066 continue;
5067 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5068 SplatIndex = -1;
5069 break;
5070 }
5071 SplatIndex = i;
5072 }
5073 if (0 <= SplatIndex) {
5074 SplatVal = EltBits[SplatIndex];
5075 return true;
5076 }
5077 }
5078
5079 return false;
5080}
5081} // namespace X86
5082} // namespace llvm
5083
5084 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5085 unsigned MaskEltSizeInBits,
5086 SmallVectorImpl<uint64_t> &RawMask,
5087 APInt &UndefElts) {
5088 // Extract the raw target constant bits.
5089 SmallVector<APInt, 64> EltBits;
5090 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5091 EltBits, /* AllowWholeUndefs */ true,
5092 /* AllowPartialUndefs */ false))
5093 return false;
5094
5095 // Insert the extracted elements into the mask.
5096 for (const APInt &Elt : EltBits)
5097 RawMask.push_back(Elt.getZExtValue());
5098
5099 return true;
5100}
5101
5102// Match not(xor X, -1) -> X.
5103// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5104// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5105// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5106 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5107 V = peekThroughBitcasts(V);
5108 if (V.getOpcode() == ISD::XOR &&
5109 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5110 isAllOnesConstant(V.getOperand(1))))
5111 return V.getOperand(0);
5112 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5113 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5114 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5115 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5116 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5117 Not, V.getOperand(1));
5118 }
5119 }
5120 if (V.getOpcode() == X86ISD::PCMPGT &&
5121 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5122 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5123 V.getOperand(0).hasOneUse()) {
5124 APInt UndefElts;
5125 SmallVector<APInt> EltBits;
5126 if (getTargetConstantBitsFromNode(V.getOperand(0),
5127 V.getScalarValueSizeInBits(), UndefElts,
5128 EltBits)) {
5129 // Don't fold min_signed_value -> (min_signed_value - 1)
5130 bool MinSigned = false;
5131 for (APInt &Elt : EltBits) {
5132 MinSigned |= Elt.isMinSignedValue();
5133 Elt -= 1;
5134 }
5135 if (!MinSigned) {
5136 SDLoc DL(V);
5137 MVT VT = V.getSimpleValueType();
5138 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5139 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5140 }
5141 }
5142 }
5143 SmallVector<SDValue, 2> CatOps;
5144 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5145 for (SDValue &CatOp : CatOps) {
5146 SDValue NotCat = IsNOT(CatOp, DAG);
5147 if (!NotCat) return SDValue();
5148 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5149 }
5150 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5151 }
5152 return SDValue();
5153}
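// The PCMPGT rule above relies on not(C > X) == (X >= C) == (X > C - 1) for
// signed compares, which is why constants containing the minimum signed value
// are rejected (C - 1 would wrap).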
5154
5155/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5156/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5157/// Note: This ignores saturation, so inputs must be checked first.
5158 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5159 bool Unary, unsigned NumStages = 1) {
5160 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5161 unsigned NumElts = VT.getVectorNumElements();
5162 unsigned NumLanes = VT.getSizeInBits() / 128;
5163 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5164 unsigned Offset = Unary ? 0 : NumElts;
5165 unsigned Repetitions = 1u << (NumStages - 1);
5166 unsigned Increment = 1u << NumStages;
5167 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5168
5169 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5170 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5171 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5172 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5173 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5174 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5175 }
5176 }
5177}
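// e.g. a single-stage v16i8 PACKSS/PACKUS of two v8i16 inputs corresponds to
// the byte shuffle <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>.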
5178
5179// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5180static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5181 APInt &DemandedLHS, APInt &DemandedRHS) {
5182 int NumLanes = VT.getSizeInBits() / 128;
5183 int NumElts = DemandedElts.getBitWidth();
5184 int NumInnerElts = NumElts / 2;
5185 int NumEltsPerLane = NumElts / NumLanes;
5186 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5187
5188 DemandedLHS = APInt::getZero(NumInnerElts);
5189 DemandedRHS = APInt::getZero(NumInnerElts);
5190
5191 // Map DemandedElts to the packed operands.
5192 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5193 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5194 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5195 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5196 if (DemandedElts[OuterIdx])
5197 DemandedLHS.setBit(InnerIdx);
5198 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5199 DemandedRHS.setBit(InnerIdx);
5200 }
5201 }
5202}
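// e.g. for a v16i8 PACK (one 128-bit lane), demanded result element 3 maps to
// element 3 of the LHS operand and demanded result element 11 maps to element
// 3 of the RHS operand.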
5203
5204// Split the demanded elts of a HADD/HSUB node between its operands.
5205static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5206 APInt &DemandedLHS, APInt &DemandedRHS) {
5207 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5208 DemandedLHS, DemandedRHS);
5209 DemandedLHS |= DemandedLHS << 1;
5210 DemandedRHS |= DemandedRHS << 1;
5211}
5212
5213/// Calculates the shuffle mask corresponding to the target-specific opcode.
5214/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5215/// operands in \p Ops, and returns true.
5216/// Sets \p IsUnary to true if only one source is used. Note that this will set
5217/// IsUnary for shuffles which use a single input multiple times, and in those
5218/// cases it will adjust the mask to only have indices within that single input.
5219/// It is an error to call this with non-empty Mask/Ops vectors.
5220static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5221 SmallVectorImpl<SDValue> &Ops,
5222 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5223 if (!isTargetShuffle(N.getOpcode()))
5224 return false;
5225
5226 MVT VT = N.getSimpleValueType();
5227 unsigned NumElems = VT.getVectorNumElements();
5228 unsigned MaskEltSize = VT.getScalarSizeInBits();
5229 SmallVector<uint64_t, 32> RawMask;
5230 APInt RawUndefs;
5231 uint64_t ImmN;
5232
5233 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5234 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5235
5236 IsUnary = false;
5237 bool IsFakeUnary = false;
5238 switch (N.getOpcode()) {
5239 case X86ISD::BLENDI:
5240 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5241 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5242 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5243 DecodeBLENDMask(NumElems, ImmN, Mask);
5244 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5245 break;
5246 case X86ISD::SHUFP:
5247 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5248 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5249 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5250 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5251 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5252 break;
5253 case X86ISD::INSERTPS:
5254 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5255 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5256 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5257 DecodeINSERTPSMask(ImmN, Mask);
5258 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5259 break;
5260 case X86ISD::EXTRQI:
5261 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5262 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5263 isa<ConstantSDNode>(N.getOperand(2))) {
5264 int BitLen = N.getConstantOperandVal(1);
5265 int BitIdx = N.getConstantOperandVal(2);
5266 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5267 IsUnary = true;
5268 }
5269 break;
5270 case X86ISD::INSERTQI:
5271 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5272 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5273 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5274 isa<ConstantSDNode>(N.getOperand(3))) {
5275 int BitLen = N.getConstantOperandVal(2);
5276 int BitIdx = N.getConstantOperandVal(3);
5277 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5278 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5279 }
5280 break;
5281 case X86ISD::UNPCKH:
5282 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5283 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5284 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5285 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5286 break;
5287 case X86ISD::UNPCKL:
5288 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5289 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5290 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5291 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5292 break;
5293 case X86ISD::MOVHLPS:
5294 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5295 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5296 DecodeMOVHLPSMask(NumElems, Mask);
5297 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5298 break;
5299 case X86ISD::MOVLHPS:
5300 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5301 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5302 DecodeMOVLHPSMask(NumElems, Mask);
5303 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5304 break;
5305 case X86ISD::VALIGN:
5306 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5307 "Only 32-bit and 64-bit elements are supported!");
5308 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5309 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5310 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5311 DecodeVALIGNMask(NumElems, ImmN, Mask);
5312 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5313 Ops.push_back(N.getOperand(1));
5314 Ops.push_back(N.getOperand(0));
5315 break;
5316 case X86ISD::PALIGNR:
5317 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5318 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5319 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5320 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5321 DecodePALIGNRMask(NumElems, ImmN, Mask);
5322 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5323 Ops.push_back(N.getOperand(1));
5324 Ops.push_back(N.getOperand(0));
5325 break;
5326 case X86ISD::VSHLDQ:
5327 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5328 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5329 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5330 DecodePSLLDQMask(NumElems, ImmN, Mask);
5331 IsUnary = true;
5332 break;
5333 case X86ISD::VSRLDQ:
5334 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5335 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5336 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5337 DecodePSRLDQMask(NumElems, ImmN, Mask);
5338 IsUnary = true;
5339 break;
5340 case X86ISD::PSHUFD:
5341 case X86ISD::VPERMILPI:
5342 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5343 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5344 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5345 IsUnary = true;
5346 break;
5347 case X86ISD::PSHUFHW:
5348 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5349 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5350 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5351 IsUnary = true;
5352 break;
5353 case X86ISD::PSHUFLW:
5354 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5355 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5356 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5357 IsUnary = true;
5358 break;
5359 case X86ISD::VZEXT_MOVL:
5360 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5361 DecodeZeroMoveLowMask(NumElems, Mask);
5362 IsUnary = true;
5363 break;
5364 case X86ISD::VBROADCAST:
5365 // We only decode broadcasts of same-sized vectors, peeking through to
5366 // extracted subvectors is likely to cause hasOneUse issues with
5367 // SimplifyDemandedBits etc.
5368 if (N.getOperand(0).getValueType() == VT) {
5369 DecodeVectorBroadcast(NumElems, Mask);
5370 IsUnary = true;
5371 break;
5372 }
5373 return false;
5374 case X86ISD::VPERMILPV: {
5375 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5376 IsUnary = true;
5377 SDValue MaskNode = N.getOperand(1);
5378 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5379 RawUndefs)) {
5380 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5381 break;
5382 }
5383 return false;
5384 }
5385 case X86ISD::PSHUFB: {
5386 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5387 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5388 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5389 IsUnary = true;
5390 SDValue MaskNode = N.getOperand(1);
5391 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5392 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5393 break;
5394 }
5395 return false;
5396 }
5397 case X86ISD::VPERMI:
5398 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5399 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5400 DecodeVPERMMask(NumElems, ImmN, Mask);
5401 IsUnary = true;
5402 break;
5403 case X86ISD::MOVSS:
5404 case X86ISD::MOVSD:
5405 case X86ISD::MOVSH:
5406 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5407 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5408 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5409 break;
5410 case X86ISD::VPERM2X128:
5411 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5412 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5413 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5414 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5415 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5416 break;
5417 case X86ISD::SHUF128:
5418 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5419 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5420 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5421 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5422 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5423 break;
5424 case X86ISD::MOVSLDUP:
5425 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5426 DecodeMOVSLDUPMask(NumElems, Mask);
5427 IsUnary = true;
5428 break;
5429 case X86ISD::MOVSHDUP:
5430 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5431 DecodeMOVSHDUPMask(NumElems, Mask);
5432 IsUnary = true;
5433 break;
5434 case X86ISD::MOVDDUP:
5435 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5436 DecodeMOVDDUPMask(NumElems, Mask);
5437 IsUnary = true;
5438 break;
5439 case X86ISD::VPERMIL2: {
5440 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5441 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5442 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5443 SDValue MaskNode = N.getOperand(2);
5444 SDValue CtrlNode = N.getOperand(3);
5445 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5446 unsigned CtrlImm = CtrlOp->getZExtValue();
5447 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5448 RawUndefs)) {
5449 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5450 Mask);
5451 break;
5452 }
5453 }
5454 return false;
5455 }
5456 case X86ISD::VPPERM: {
5457 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5458 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5459 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5460 SDValue MaskNode = N.getOperand(2);
5461 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5462 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5463 break;
5464 }
5465 return false;
5466 }
5467 case X86ISD::VPERMV: {
5468 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5469 IsUnary = true;
5470 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5471 Ops.push_back(N.getOperand(1));
5472 SDValue MaskNode = N.getOperand(0);
5473 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5474 RawUndefs)) {
5475 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5476 break;
5477 }
5478 return false;
5479 }
5480 case X86ISD::VPERMV3: {
5481 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5482 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5483 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5484 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5485 Ops.push_back(N.getOperand(0));
5486 Ops.push_back(N.getOperand(2));
5487 SDValue MaskNode = N.getOperand(1);
5488 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5489 RawUndefs)) {
5490 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5491 break;
5492 }
5493 return false;
5494 }
5495 default:
5496 llvm_unreachable("unknown target shuffle node");
5497 }
5498
5499 // Empty mask indicates the decode failed.
5500 if (Mask.empty())
5501 return false;
5502
5503 // Check if we're getting a shuffle mask with zero'd elements.
5504 if (!AllowSentinelZero && isAnyZero(Mask))
5505 return false;
5506
5507 // If we have a fake unary shuffle, the shuffle mask is spread across two
5508 // inputs that are actually the same node. Re-map the mask to always point
5509 // into the first input.
5510 if (IsFakeUnary)
5511 for (int &M : Mask)
5512 if (M >= (int)Mask.size())
5513 M -= Mask.size();
5514
5515 // If we didn't already add operands in the opcode-specific code, default to
5516 // adding 1 or 2 operands starting at 0.
5517 if (Ops.empty()) {
5518 Ops.push_back(N.getOperand(0));
5519 if (!IsUnary || IsFakeUnary)
5520 Ops.push_back(N.getOperand(1));
5521 }
5522
5523 return true;
5524}
5525
5526 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5527static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5528 SmallVectorImpl<SDValue> &Ops,
5529 SmallVectorImpl<int> &Mask) {
5530 bool IsUnary;
5531 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5532}
5533
5534/// Compute whether each element of a shuffle is zeroable.
5535///
5536/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5537/// Either it is an undef element in the shuffle mask, the element of the input
5538/// referenced is undef, or the element of the input referenced is known to be
5539/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5540/// as many lanes with this technique as possible to simplify the remaining
5541/// shuffle.
5542 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5543 SDValue V1, SDValue V2,
5544 APInt &KnownUndef, APInt &KnownZero) {
5545 int Size = Mask.size();
5546 KnownUndef = KnownZero = APInt::getZero(Size);
5547
5548 V1 = peekThroughBitcasts(V1);
5549 V2 = peekThroughBitcasts(V2);
5550
5551 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5552 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5553
5554 int VectorSizeInBits = V1.getValueSizeInBits();
5555 int ScalarSizeInBits = VectorSizeInBits / Size;
5556 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5557
5558 for (int i = 0; i < Size; ++i) {
5559 int M = Mask[i];
5560 // Handle the easy cases.
5561 if (M < 0) {
5562 KnownUndef.setBit(i);
5563 continue;
5564 }
5565 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5566 KnownZero.setBit(i);
5567 continue;
5568 }
5569
5570 // Determine shuffle input and normalize the mask.
5571 SDValue V = M < Size ? V1 : V2;
5572 M %= Size;
5573
5574 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5575 if (V.getOpcode() != ISD::BUILD_VECTOR)
5576 continue;
5577
5578 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5579 // the (larger) source element must be UNDEF/ZERO.
5580 if ((Size % V.getNumOperands()) == 0) {
5581 int Scale = Size / V->getNumOperands();
5582 SDValue Op = V.getOperand(M / Scale);
5583 if (Op.isUndef())
5584 KnownUndef.setBit(i);
5585 if (X86::isZeroNode(Op))
5586 KnownZero.setBit(i);
5587 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5588 APInt Val = Cst->getAPIntValue();
5589 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5590 if (Val == 0)
5591 KnownZero.setBit(i);
5592 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5593 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5594 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5595 if (Val == 0)
5596 KnownZero.setBit(i);
5597 }
5598 continue;
5599 }
5600
5601 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5602 // elements must be UNDEF or ZERO.
5603 if ((V.getNumOperands() % Size) == 0) {
5604 int Scale = V->getNumOperands() / Size;
5605 bool AllUndef = true;
5606 bool AllZero = true;
5607 for (int j = 0; j < Scale; ++j) {
5608 SDValue Op = V.getOperand((M * Scale) + j);
5609 AllUndef &= Op.isUndef();
5610 AllZero &= X86::isZeroNode(Op);
5611 }
5612 if (AllUndef)
5613 KnownUndef.setBit(i);
5614 if (AllZero)
5615 KnownZero.setBit(i);
5616 continue;
5617 }
5618 }
5619}
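// e.g. shuffling a <4 x i32> V1 with an all-zeros V2 using mask <0,4,1,5>
// marks result elements 1 and 3 as zeroable.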
5620
5621/// Decode a target shuffle mask and inputs and see if any values are
5622/// known to be undef or zero from their inputs.
5623/// Returns true if the target shuffle mask was decoded.
5624/// FIXME: Merge this with computeZeroableShuffleElements?
5625 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5626 SmallVectorImpl<SDValue> &Ops,
5627 APInt &KnownUndef, APInt &KnownZero) {
5628 bool IsUnary;
5629 if (!isTargetShuffle(N.getOpcode()))
5630 return false;
5631
5632 MVT VT = N.getSimpleValueType();
5633 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5634 return false;
5635
5636 int Size = Mask.size();
5637 SDValue V1 = Ops[0];
5638 SDValue V2 = IsUnary ? V1 : Ops[1];
5639 KnownUndef = KnownZero = APInt::getZero(Size);
5640
5641 V1 = peekThroughBitcasts(V1);
5642 V2 = peekThroughBitcasts(V2);
5643
5644 assert((VT.getSizeInBits() % Size) == 0 &&
5645 "Illegal split of shuffle value type");
5646 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5647
5648 // Extract known constant input data.
5649 APInt UndefSrcElts[2];
5650 SmallVector<APInt, 32> SrcEltBits[2];
5651 bool IsSrcConstant[2] = {
5652 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5653 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5654 /*AllowPartialUndefs*/ false),
5655 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5656 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5657 /*AllowPartialUndefs*/ false)};
5658
5659 for (int i = 0; i < Size; ++i) {
5660 int M = Mask[i];
5661
5662 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5663 if (M < 0) {
5664 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5665 if (SM_SentinelUndef == M)
5666 KnownUndef.setBit(i);
5667 if (SM_SentinelZero == M)
5668 KnownZero.setBit(i);
5669 continue;
5670 }
5671
5672 // Determine shuffle input and normalize the mask.
5673 unsigned SrcIdx = M / Size;
5674 SDValue V = M < Size ? V1 : V2;
5675 M %= Size;
5676
5677 // We are referencing an UNDEF input.
5678 if (V.isUndef()) {
5679 KnownUndef.setBit(i);
5680 continue;
5681 }
5682
5683 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5684 // TODO: We currently only set UNDEF for integer types - floats use the same
5685 // registers as vectors and many of the scalar folded loads rely on the
5686 // SCALAR_TO_VECTOR pattern.
5687 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5688 (Size % V.getValueType().getVectorNumElements()) == 0) {
5689 int Scale = Size / V.getValueType().getVectorNumElements();
5690 int Idx = M / Scale;
5691 if (Idx != 0 && !VT.isFloatingPoint())
5692 KnownUndef.setBit(i);
5693 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5694 KnownZero.setBit(i);
5695 continue;
5696 }
5697
5698 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5699 // base vectors.
5700 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5701 SDValue Vec = V.getOperand(0);
5702 int NumVecElts = Vec.getValueType().getVectorNumElements();
5703 if (Vec.isUndef() && Size == NumVecElts) {
5704 int Idx = V.getConstantOperandVal(2);
5705 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5706 if (M < Idx || (Idx + NumSubElts) <= M)
5707 KnownUndef.setBit(i);
5708 }
5709 continue;
5710 }
5711
5712 // Attempt to extract from the source's constant bits.
5713 if (IsSrcConstant[SrcIdx]) {
5714 if (UndefSrcElts[SrcIdx][M])
5715 KnownUndef.setBit(i);
5716 else if (SrcEltBits[SrcIdx][M] == 0)
5717 KnownZero.setBit(i);
5718 }
5719 }
5720
5721 assert(VT.getVectorNumElements() == (unsigned)Size &&
5722 "Different mask size from vector size!");
5723 return true;
5724}
5725
5726// Replace target shuffle mask elements with known undef/zero sentinels.
5727 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5728 const APInt &KnownUndef,
5729 const APInt &KnownZero,
5730 bool ResolveKnownZeros = true) {
5731 unsigned NumElts = Mask.size();
5732 assert(KnownUndef.getBitWidth() == NumElts &&
5733 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5734
5735 for (unsigned i = 0; i != NumElts; ++i) {
5736 if (KnownUndef[i])
5737 Mask[i] = SM_SentinelUndef;
5738 else if (ResolveKnownZeros && KnownZero[i])
5739 Mask[i] = SM_SentinelZero;
5740 }
5741}
5742
5743// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5744 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5745 APInt &KnownUndef,
5746 APInt &KnownZero) {
5747 unsigned NumElts = Mask.size();
5748 KnownUndef = KnownZero = APInt::getZero(NumElts);
5749
5750 for (unsigned i = 0; i != NumElts; ++i) {
5751 int M = Mask[i];
5752 if (SM_SentinelUndef == M)
5753 KnownUndef.setBit(i);
5754 if (SM_SentinelZero == M)
5755 KnownZero.setBit(i);
5756 }
5757}
5758
5759// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5760 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5761 SDValue Cond, bool IsBLENDV = false) {
5762 EVT CondVT = Cond.getValueType();
5763 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5764 unsigned NumElts = CondVT.getVectorNumElements();
5765
5766 APInt UndefElts;
5767 SmallVector<APInt, 32> EltBits;
5768 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5769 /*AllowWholeUndefs*/ true,
5770 /*AllowPartialUndefs*/ false))
5771 return false;
5772
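// Start from an identity mask selecting from the first value operand, then redirect an element to the second operand whenever the condition selects it (a zero condition element for VSELECT, a non-negative one for BLENDV).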
5773 Mask.resize(NumElts, SM_SentinelUndef);
5774
5775 for (int i = 0; i != (int)NumElts; ++i) {
5776 Mask[i] = i;
5777 // Arbitrarily choose from the 2nd operand if the select condition element
5778 // is undef.
5779 // TODO: Can we do better by matching patterns such as even/odd?
5780 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5781 (IsBLENDV && EltBits[i].isNonNegative()))
5782 Mask[i] += NumElts;
5783 }
5784
5785 return true;
5786}
5787
5788// Forward declaration (for getFauxShuffleMask recursive check).
5789static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5790 SmallVectorImpl<SDValue> &Inputs,
5791 SmallVectorImpl<int> &Mask,
5792 const SelectionDAG &DAG, unsigned Depth,
5793 bool ResolveKnownElts);
5794
5795// Attempt to decode ops that could be represented as a shuffle mask.
5796// The decoded shuffle mask may contain a different number of elements to the
5797// destination value type.
5798// TODO: Merge into getTargetShuffleInputs()
5799static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5800 SmallVectorImpl<int> &Mask,
5801 SmallVectorImpl<SDValue> &Ops,
5802 const SelectionDAG &DAG, unsigned Depth,
5803 bool ResolveKnownElts) {
5804 Mask.clear();
5805 Ops.clear();
5806
5807 MVT VT = N.getSimpleValueType();
5808 unsigned NumElts = VT.getVectorNumElements();
5809 unsigned NumSizeInBits = VT.getSizeInBits();
5810 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5811 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5812 return false;
5813 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5814 unsigned NumSizeInBytes = NumSizeInBits / 8;
5815 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5816
5817 unsigned Opcode = N.getOpcode();
5818 switch (Opcode) {
5819 case ISD::VECTOR_SHUFFLE: {
5820 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5821 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5822 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5823 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5824 Ops.push_back(N.getOperand(0));
5825 Ops.push_back(N.getOperand(1));
5826 return true;
5827 }
5828 return false;
5829 }
5830 case ISD::AND:
5831 case X86ISD::ANDNP: {
5832 // Attempt to decode as a per-byte mask.
5833 APInt UndefElts;
5834 SmallVector<APInt, 32> EltBits;
5835 SDValue N0 = N.getOperand(0);
5836 SDValue N1 = N.getOperand(1);
5837 bool IsAndN = (X86ISD::ANDNP == Opcode);
5838 uint64_t ZeroMask = IsAndN ? 255 : 0;
5839 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5840 /*AllowWholeUndefs*/ false,
5841 /*AllowPartialUndefs*/ false))
5842 return false;
5843 // We can't assume an undef src element gives an undef dst - the other src
5844 // might be zero.
5845 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5846 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5847 const APInt &ByteBits = EltBits[i];
5848 if (ByteBits != 0 && ByteBits != 255)
5849 return false;
5850 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5851 }
5852 Ops.push_back(IsAndN ? N1 : N0);
5853 return true;
5854 }
5855 case ISD::OR: {
5856 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5857 // is a valid shuffle index.
5858 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5859 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5860 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5861 return false;
5862
5863 SmallVector<int, 64> SrcMask0, SrcMask1;
5864 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5865 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5866 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5867 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5868 Depth + 1, true) ||
5869 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5870 Depth + 1, true))
5871 return false;
5872
5873 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5874 SmallVector<int, 64> Mask0, Mask1;
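// Scale both source masks up to the common width so the per-element checks below compare matching positions.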
5875 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5876 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5877 for (int i = 0; i != (int)MaskSize; ++i) {
5878 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5879 // loops converting between OR and BLEND shuffles due to
5880 // canWidenShuffleElements merging away undef elements, meaning we
5881 // fail to recognise the OR as the undef element isn't known zero.
5882 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5883 Mask.push_back(SM_SentinelZero);
5884 else if (Mask1[i] == SM_SentinelZero)
5885 Mask.push_back(i);
5886 else if (Mask0[i] == SM_SentinelZero)
5887 Mask.push_back(i + MaskSize);
5888 else
5889 return false;
5890 }
5891 Ops.push_back(N0);
5892 Ops.push_back(N1);
5893 return true;
5894 }
5895 case ISD::INSERT_SUBVECTOR: {
5896 SDValue Src = N.getOperand(0);
5897 SDValue Sub = N.getOperand(1);
5898 EVT SubVT = Sub.getValueType();
5899 unsigned NumSubElts = SubVT.getVectorNumElements();
5900 if (!N->isOnlyUserOf(Sub.getNode()))
5901 return false;
5902 SDValue SubBC = peekThroughBitcasts(Sub);
5903 uint64_t InsertIdx = N.getConstantOperandVal(2);
5904 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5905 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5906 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5907 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5908 SDValue SubBCSrc = SubBC.getOperand(0);
5909 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5910 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5911 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5912 "Subvector valuetype mismatch");
5913 InsertIdx *= (MaxElts / NumElts);
5914 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5915 NumSubElts *= (MaxElts / NumElts);
5916 bool SrcIsUndef = Src.isUndef();
5917 for (int i = 0; i != (int)MaxElts; ++i)
5918 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5919 for (int i = 0; i != (int)NumSubElts; ++i)
5920 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5921 if (!SrcIsUndef)
5922 Ops.push_back(Src);
5923 Ops.push_back(SubBCSrc);
5924 return true;
5925 }
5926 // Handle CONCAT(SUB0, SUB1).
5927 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5928 // cross lane shuffles.
5929 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5930 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5931 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5932 Src.getOperand(0).isUndef() &&
5933 Src.getOperand(1).getValueType() == SubVT &&
5934 Src.getConstantOperandVal(2) == 0) {
5935 for (int i = 0; i != (int)NumSubElts; ++i)
5936 Mask.push_back(i);
5937 for (int i = 0; i != (int)NumSubElts; ++i)
5938 Mask.push_back(i + NumElts);
5939 Ops.push_back(Src.getOperand(1));
5940 Ops.push_back(Sub);
5941 return true;
5942 }
5943 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5944 SmallVector<int, 64> SubMask;
5945 SmallVector<SDValue, 2> SubInputs;
5946 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5947 EVT SubSrcVT = SubSrc.getValueType();
5948 if (!SubSrcVT.isVector())
5949 return false;
5950
5951 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5952 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5953 Depth + 1, ResolveKnownElts))
5954 return false;
5955
5956 // Subvector shuffle inputs must not be larger than the subvector.
5957 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5958 return SubVT.getFixedSizeInBits() <
5959 SubInput.getValueSizeInBits().getFixedValue();
5960 }))
5961 return false;
5962
5963 if (SubMask.size() != NumSubElts) {
5964 assert(((SubMask.size() % NumSubElts) == 0 ||
5965 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5966 if ((NumSubElts % SubMask.size()) == 0) {
5967 int Scale = NumSubElts / SubMask.size();
5968 SmallVector<int, 64> ScaledSubMask;
5969 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5970 SubMask = ScaledSubMask;
5971 } else {
5972 int Scale = SubMask.size() / NumSubElts;
5973 NumSubElts = SubMask.size();
5974 NumElts *= Scale;
5975 InsertIdx *= Scale;
5976 }
5977 }
5978 Ops.push_back(Src);
5979 Ops.append(SubInputs.begin(), SubInputs.end());
5980 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5981 Mask.append(NumElts, SM_SentinelZero);
5982 else
5983 for (int i = 0; i != (int)NumElts; ++i)
5984 Mask.push_back(i);
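// Remap the subvector shuffle indices into the combined mask; the subvector inputs were appended after Src, so their elements live past the first NumElts entries.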
5985 for (int i = 0; i != (int)NumSubElts; ++i) {
5986 int M = SubMask[i];
5987 if (0 <= M) {
5988 int InputIdx = M / NumSubElts;
5989 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5990 }
5991 Mask[i + InsertIdx] = M;
5992 }
5993 return true;
5994 }
5995 case X86ISD::PINSRB:
5996 case X86ISD::PINSRW:
5997 case ISD::SCALAR_TO_VECTOR:
5998 case ISD::INSERT_VECTOR_ELT: {
5999 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6000 // vector, for matching src/dst vector types.
6001 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6002
6003 unsigned DstIdx = 0;
6004 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6005 // Check we have an in-range constant insertion index.
6006 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6007 N.getConstantOperandAPInt(2).uge(NumElts))
6008 return false;
6009 DstIdx = N.getConstantOperandVal(2);
6010
6011 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6012 if (X86::isZeroNode(Scl)) {
6013 Ops.push_back(N.getOperand(0));
6014 for (unsigned i = 0; i != NumElts; ++i)
6015 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6016 return true;
6017 }
6018 }
6019
6020 // Peek through trunc/aext/zext/bitcast.
6021 // TODO: aext shouldn't require SM_SentinelZero padding.
6022 // TODO: handle shift of scalars.
6023 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6024 while (Scl.getOpcode() == ISD::TRUNCATE ||
6025 Scl.getOpcode() == ISD::ANY_EXTEND ||
6026 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6027 (Scl.getOpcode() == ISD::BITCAST &&
6030 Scl = Scl.getOperand(0);
6031 MinBitsPerElt =
6032 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6033 }
6034 if ((MinBitsPerElt % 8) != 0)
6035 return false;
6036
6037 // Attempt to find the source vector the scalar was extracted from.
6038 SDValue SrcExtract;
6039 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6040 Scl.getOpcode() == X86ISD::PEXTRW ||
6041 Scl.getOpcode() == X86ISD::PEXTRB) &&
6042 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6043 SrcExtract = Scl;
6044 }
6045 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6046 return false;
6047
6048 SDValue SrcVec = SrcExtract.getOperand(0);
6049 EVT SrcVT = SrcVec.getValueType();
6050 if (!SrcVT.getScalarType().isByteSized())
6051 return false;
6052 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6053 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6054 unsigned DstByte = DstIdx * NumBytesPerElt;
6055 MinBitsPerElt =
6056 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6057
6058 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6059 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6060 Ops.push_back(SrcVec);
6061 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6062 } else {
6063 Ops.push_back(SrcVec);
6064 Ops.push_back(N.getOperand(0));
6065 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6066 Mask.push_back(NumSizeInBytes + i);
6067 }
6068
6069 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6070 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
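// Overwrite the destination element's bytes with the extracted source bytes and zero-fill any remaining high bytes of the element.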
6071 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6072 Mask[DstByte + i] = SrcByte + i;
6073 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6074 Mask[DstByte + i] = SM_SentinelZero;
6075 return true;
6076 }
6077 case X86ISD::PACKSS:
6078 case X86ISD::PACKUS: {
6079 SDValue N0 = N.getOperand(0);
6080 SDValue N1 = N.getOperand(1);
6081 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6082 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6083 "Unexpected input value type");
6084
6085 APInt EltsLHS, EltsRHS;
6086 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6087
6088 // If we know input saturation won't happen (or we don't care for particular
6089 // lanes), we can treat this as a truncation shuffle.
6090 bool Offset0 = false, Offset1 = false;
6091 if (Opcode == X86ISD::PACKSS) {
6092 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6093 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6094 (!(N1.isUndef() || EltsRHS.isZero()) &&
6095 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6096 return false;
6097 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6098 // PACKSS then it was likely being used for sign-extension for a
6099 // truncation, so just peek through and adjust the mask accordingly.
6100 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6101 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6102 Offset0 = true;
6103 N0 = N0.getOperand(0);
6104 }
6105 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6106 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6107 Offset1 = true;
6108 N1 = N1.getOperand(0);
6109 }
6110 } else {
6111 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6112 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6113 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6114 (!(N1.isUndef() || EltsRHS.isZero()) &&
6115 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6116 return false;
6117 }
6118
6119 bool IsUnary = (N0 == N1);
6120
6121 Ops.push_back(N0);
6122 if (!IsUnary)
6123 Ops.push_back(N1);
6124
6125 createPackShuffleMask(VT, Mask, IsUnary);
6126
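// If we peeked through a VSRAI above, the pack mask currently selects the low half of each source element; bump those entries by one so they select the original high half instead.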
6127 if (Offset0 || Offset1) {
6128 for (int &M : Mask)
6129 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6130 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6131 ++M;
6132 }
6133 return true;
6134 }
6135 case ISD::VSELECT:
6136 case X86ISD::BLENDV: {
6137 SDValue Cond = N.getOperand(0);
6138 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6139 Ops.push_back(N.getOperand(1));
6140 Ops.push_back(N.getOperand(2));
6141 return true;
6142 }
6143 return false;
6144 }
6145 case X86ISD::VTRUNC: {
6146 SDValue Src = N.getOperand(0);
6147 EVT SrcVT = Src.getValueType();
6148 // Truncated source must be a simple vector.
6149 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6150 (SrcVT.getScalarSizeInBits() % 8) != 0)
6151 return false;
6152 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6153 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6154 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6155 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6156 for (unsigned i = 0; i != NumSrcElts; ++i)
6157 Mask.push_back(i * Scale);
6158 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6159 Ops.push_back(Src);
6160 return true;
6161 }
6162 case X86ISD::VSHLI:
6163 case X86ISD::VSRLI: {
6164 uint64_t ShiftVal = N.getConstantOperandVal(1);
6165 // Out of range bit shifts are guaranteed to be zero.
6166 if (NumBitsPerElt <= ShiftVal) {
6167 Mask.append(NumElts, SM_SentinelZero);
6168 return true;
6169 }
6170
6171 // We can only decode 'whole byte' bit shifts as shuffles.
6172 if ((ShiftVal % 8) != 0)
6173 break;
6174
6175 uint64_t ByteShift = ShiftVal / 8;
6176 Ops.push_back(N.getOperand(0));
6177
6178 // Clear mask to all zeros and insert the shifted byte indices.
6179 Mask.append(NumSizeInBytes, SM_SentinelZero);
6180
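// VSHLI moves data towards the higher (more significant) bytes of each element, VSRLI towards the lower bytes; map destination bytes to source bytes accordingly.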
6181 if (X86ISD::VSHLI == Opcode) {
6182 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6183 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6184 Mask[i + j] = i + j - ByteShift;
6185 } else {
6186 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6187 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6188 Mask[i + j - ByteShift] = i + j;
6189 }
6190 return true;
6191 }
6192 case X86ISD::VROTLI:
6193 case X86ISD::VROTRI: {
6194 // We can only decode 'whole byte' bit rotates as shuffles.
6195 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6196 if ((RotateVal % 8) != 0)
6197 return false;
6198 Ops.push_back(N.getOperand(0));
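// Express the rotate as the source byte each destination byte reads: a right rotate by R bytes reads byte (j + R) % NumBytesPerElt, and a left rotate by R is equivalent to a right rotate by (NumBytesPerElt - R).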
6199 int Offset = RotateVal / 8;
6200 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6201 for (int i = 0; i != (int)NumElts; ++i) {
6202 int BaseIdx = i * NumBytesPerElt;
6203 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6204 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6205 }
6206 }
6207 return true;
6208 }
6209 case X86ISD::VBROADCAST: {
6210 SDValue Src = N.getOperand(0);
6211 if (!Src.getSimpleValueType().isVector()) {
6212 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6213 !isNullConstant(Src.getOperand(1)) ||
6214 Src.getOperand(0).getValueType().getScalarType() !=
6215 VT.getScalarType())
6216 return false;
6217 Src = Src.getOperand(0);
6218 }
6219 Ops.push_back(Src);
6220 Mask.append(NumElts, 0);
6221 return true;
6222 }
6223 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6224 SDValue Src = N.getOperand(0);
6225 EVT SrcVT = Src.getValueType();
6226 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6227
6228 // Extended source must be a simple vector.
6229 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6230 (NumBitsPerSrcElt % 8) != 0)
6231 return false;
6232
6233 // We can only handle all-signbits extensions.
6234 APInt DemandedSrcElts =
6235 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6236 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6237 return false;
6238
6239 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6240 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6241 for (unsigned I = 0; I != NumElts; ++I)
6242 Mask.append(Scale, I);
6243 Ops.push_back(Src);
6244 return true;
6245 }
6246 case ISD::ZERO_EXTEND:
6247 case ISD::ANY_EXTEND:
6248 case ISD::ZERO_EXTEND_VECTOR_INREG:
6249 case ISD::ANY_EXTEND_VECTOR_INREG: {
6250 SDValue Src = N.getOperand(0);
6251 EVT SrcVT = Src.getValueType();
6252
6253 // Extended source must be a simple vector.
6254 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6255 (SrcVT.getScalarSizeInBits() % 8) != 0)
6256 return false;
6257
6258 bool IsAnyExtend =
6259 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6260 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6261 IsAnyExtend, Mask);
6262 Ops.push_back(Src);
6263 return true;
6264 }
6265 }
6266
6267 return false;
6268}
6269
6270/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6271 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6272 SmallVectorImpl<int> &Mask) {
6273 int MaskWidth = Mask.size();
6274 SmallVector<SDValue, 16> UsedInputs;
6275 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6276 int lo = UsedInputs.size() * MaskWidth;
6277 int hi = lo + MaskWidth;
6278
6279 // Strip UNDEF input usage.
6280 if (Inputs[i].isUndef())
6281 for (int &M : Mask)
6282 if ((lo <= M) && (M < hi))
6283 M = SM_SentinelUndef;
6284
6285 // Check for unused inputs.
6286 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6287 for (int &M : Mask)
6288 if (lo <= M)
6289 M -= MaskWidth;
6290 continue;
6291 }
6292
6293 // Check for repeated inputs.
6294 bool IsRepeat = false;
6295 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6296 if (UsedInputs[j] != Inputs[i])
6297 continue;
6298 for (int &M : Mask)
6299 if (lo <= M)
6300 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6301 IsRepeat = true;
6302 break;
6303 }
6304 if (IsRepeat)
6305 continue;
6306
6307 UsedInputs.push_back(Inputs[i]);
6308 }
6309 Inputs = UsedInputs;
6310}
6311
6312/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6313/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6314/// Returns true if the target shuffle mask was decoded.
6315static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6316 SmallVectorImpl<SDValue> &Inputs,
6317 SmallVectorImpl<int> &Mask,
6318 APInt &KnownUndef, APInt &KnownZero,
6319 const SelectionDAG &DAG, unsigned Depth,
6320 bool ResolveKnownElts) {
6321 if (Depth >= SelectionDAG::MaxRecursionDepth)
6322 return false; // Limit search depth.
6323
6324 EVT VT = Op.getValueType();
6325 if (!VT.isSimple() || !VT.isVector())
6326 return false;
6327
6328 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6329 if (ResolveKnownElts)
6330 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6331 return true;
6332 }
6333 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6334 ResolveKnownElts)) {
6335 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6336 return true;
6337 }
6338 return false;
6339}
6340
6341static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6342 SmallVectorImpl<SDValue> &Inputs,
6343 SmallVectorImpl<int> &Mask,
6344 const SelectionDAG &DAG, unsigned Depth,
6345 bool ResolveKnownElts) {
6346 APInt KnownUndef, KnownZero;
6347 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6348 KnownZero, DAG, Depth, ResolveKnownElts);
6349}
6350
6351 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6352 SmallVectorImpl<int> &Mask,
6353 const SelectionDAG &DAG, unsigned Depth = 0,
6354 bool ResolveKnownElts = true) {
6355 EVT VT = Op.getValueType();
6356 if (!VT.isSimple() || !VT.isVector())
6357 return false;
6358
6359 unsigned NumElts = Op.getValueType().getVectorNumElements();
6360 APInt DemandedElts = APInt::getAllOnes(NumElts);
6361 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6362 ResolveKnownElts);
6363}
6364
6365// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6366static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6367 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6368 SelectionDAG &DAG) {
6369 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6370 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6371 "Unknown broadcast load type");
6372
6373 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6374 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6375 return SDValue();
6376
6379 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6380 SDValue Ops[] = {Mem->getChain(), Ptr};
6381 SDValue BcstLd = DAG.getMemIntrinsicNode(
6382 Opcode, DL, Tys, Ops, MemVT,
6383 DAG.getMachineFunction().getMachineMemOperand(
6384 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6385 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6386 return BcstLd;
6387}
6388
6389/// Returns the scalar element that will make up the i'th
6390/// element of the result of the vector shuffle.
6391 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6392 SelectionDAG &DAG, unsigned Depth) {
6393 if (Depth >= SelectionDAG::MaxRecursionDepth)
6394 return SDValue(); // Limit search depth.
6395
6396 EVT VT = Op.getValueType();
6397 unsigned Opcode = Op.getOpcode();
6398 unsigned NumElems = VT.getVectorNumElements();
6399
6400 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6401 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6402 int Elt = SV->getMaskElt(Index);
6403
6404 if (Elt < 0)
6405 return DAG.getUNDEF(VT.getVectorElementType());
6406
6407 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6408 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6409 }
6410
6411 // Recurse into target specific vector shuffles to find scalars.
6412 if (isTargetShuffle(Opcode)) {
6413 MVT ShufVT = VT.getSimpleVT();
6414 MVT ShufSVT = ShufVT.getVectorElementType();
6415 int NumElems = (int)ShufVT.getVectorNumElements();
6416 SmallVector<int, 16> ShuffleMask;
6417 SmallVector<SDValue, 16> ShuffleOps;
6418 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6419 return SDValue();
6420
6421 int Elt = ShuffleMask[Index];
6422 if (Elt == SM_SentinelZero)
6423 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6424 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6425 if (Elt == SM_SentinelUndef)
6426 return DAG.getUNDEF(ShufSVT);
6427
6428 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6429 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6430 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6431 }
6432
6433 // Recurse into insert_subvector base/sub vector to find scalars.
6434 if (Opcode == ISD::INSERT_SUBVECTOR) {
6435 SDValue Vec = Op.getOperand(0);
6436 SDValue Sub = Op.getOperand(1);
6437 uint64_t SubIdx = Op.getConstantOperandVal(2);
6438 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6439
6440 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6441 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6442 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6443 }
6444
6445 // Recurse into concat_vectors sub vector to find scalars.
6446 if (Opcode == ISD::CONCAT_VECTORS) {
6447 EVT SubVT = Op.getOperand(0).getValueType();
6448 unsigned NumSubElts = SubVT.getVectorNumElements();
6449 uint64_t SubIdx = Index / NumSubElts;
6450 uint64_t SubElt = Index % NumSubElts;
6451 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6452 }
6453
6454 // Recurse into extract_subvector src vector to find scalars.
6455 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6456 SDValue Src = Op.getOperand(0);
6457 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6458 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6459 }
6460
6461 // We only peek through bitcasts of the same vector width.
6462 if (Opcode == ISD::BITCAST) {
6463 SDValue Src = Op.getOperand(0);
6464 EVT SrcVT = Src.getValueType();
6465 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6466 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6467 return SDValue();
6468 }
6469
6470 // Actual nodes that may contain scalar elements
6471
6472 // For insert_vector_elt - either return the index matching scalar or recurse
6473 // into the base vector.
6474 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6475 isa<ConstantSDNode>(Op.getOperand(2))) {
6476 if (Op.getConstantOperandAPInt(2) == Index)
6477 return Op.getOperand(1);
6478 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6479 }
6480
6481 if (Opcode == ISD::SCALAR_TO_VECTOR)
6482 return (Index == 0) ? Op.getOperand(0)
6483 : DAG.getUNDEF(VT.getVectorElementType());
6484
6485 if (Opcode == ISD::BUILD_VECTOR)
6486 return Op.getOperand(Index);
6487
6488 return SDValue();
6489}
6490
6491// Use PINSRB/PINSRW/PINSRD to create a build vector.
6492 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6493 const APInt &NonZeroMask,
6494 unsigned NumNonZero, unsigned NumZero,
6495 SelectionDAG &DAG,
6496 const X86Subtarget &Subtarget) {
6497 MVT VT = Op.getSimpleValueType();
6498 unsigned NumElts = VT.getVectorNumElements();
6499 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6500 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6501 "Illegal vector insertion");
6502
6503 SDValue V;
6504 bool First = true;
6505
6506 for (unsigned i = 0; i < NumElts; ++i) {
6507 bool IsNonZero = NonZeroMask[i];
6508 if (!IsNonZero)
6509 continue;
6510
6511 // If the build vector contains zeros or our first insertion is not the
6512 // first index, then insert into a zero vector to break any register
6513 // dependency; otherwise use SCALAR_TO_VECTOR.
6514 if (First) {
6515 First = false;
6516 if (NumZero || 0 != i)
6517 V = getZeroVector(VT, Subtarget, DAG, DL);
6518 else {
6519 assert(0 == i && "Expected insertion into zero-index");
6520 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6521 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6522 V = DAG.getBitcast(VT, V);
6523 continue;
6524 }
6525 }
6526 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6527 DAG.getIntPtrConstant(i, DL));
6528 }
6529
6530 return V;
6531}
6532
6533/// Custom lower build_vector of v16i8.
6534 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6535 const APInt &NonZeroMask,
6536 unsigned NumNonZero, unsigned NumZero,
6537 SelectionDAG &DAG,
6538 const X86Subtarget &Subtarget) {
6539 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6540 return SDValue();
6541
6542 // SSE4.1 - use PINSRB to insert each byte directly.
6543 if (Subtarget.hasSSE41())
6544 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6545 DAG, Subtarget);
6546
6547 SDValue V;
6548
6549 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6550 // If both of the lowest 16-bit halves are non-zero, then convert to MOVD.
6551 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6552 !NonZeroMask.extractBits(2, 2).isZero()) {
6553 for (unsigned I = 0; I != 4; ++I) {
6554 if (!NonZeroMask[I])
6555 continue;
6556 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6557 if (I != 0)
6558 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6559 DAG.getConstant(I * 8, DL, MVT::i8));
6560 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6561 }
6562 assert(V && "Failed to fold v16i8 vector to zero");
6563 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6564 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6565 V = DAG.getBitcast(MVT::v8i16, V);
6566 }
6567 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6568 bool ThisIsNonZero = NonZeroMask[i];
6569 bool NextIsNonZero = NonZeroMask[i + 1];
6570 if (!ThisIsNonZero && !NextIsNonZero)
6571 continue;
6572
6573 SDValue Elt;
6574 if (ThisIsNonZero) {
6575 if (NumZero || NextIsNonZero)
6576 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6577 else
6578 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6579 }
6580
6581 if (NextIsNonZero) {
6582 SDValue NextElt = Op.getOperand(i + 1);
6583 if (i == 0 && NumZero)
6584 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6585 else
6586 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6587 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6588 DAG.getConstant(8, DL, MVT::i8));
6589 if (ThisIsNonZero)
6590 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6591 else
6592 Elt = NextElt;
6593 }
6594
6595 // If our first insertion is not the first index or zeros are needed, then
6596 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6597 // elements undefined).
6598 if (!V) {
6599 if (i != 0 || NumZero)
6600 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6601 else {
6602 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6603 V = DAG.getBitcast(MVT::v8i16, V);
6604 continue;
6605 }
6606 }
6607 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6608 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6609 DAG.getIntPtrConstant(i / 2, DL));
6610 }
6611
6612 return DAG.getBitcast(MVT::v16i8, V);
6613}
6614
6615/// Custom lower build_vector of v8i16.
6616 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6617 const APInt &NonZeroMask,
6618 unsigned NumNonZero, unsigned NumZero,
6619 SelectionDAG &DAG,
6620 const X86Subtarget &Subtarget) {
6621 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6622 return SDValue();
6623
6624 // Use PINSRW to insert each 16-bit element directly.
6625 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6626 Subtarget);
6627}
6628
6629/// Custom lower build_vector of v4i32 or v4f32.
6630 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6631 SelectionDAG &DAG,
6632 const X86Subtarget &Subtarget) {
6633 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6634 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6635 // Because we're creating a less complicated build vector here, we may enable
6636 // further folding of the MOVDDUP via shuffle transforms.
6637 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6638 Op.getOperand(0) == Op.getOperand(2) &&
6639 Op.getOperand(1) == Op.getOperand(3) &&
6640 Op.getOperand(0) != Op.getOperand(1)) {
6641 MVT VT = Op.getSimpleValueType();
6642 MVT EltVT = VT.getVectorElementType();
6643 // Create a new build vector with the first 2 elements followed by undef
6644 // padding, bitcast to v2f64, duplicate, and bitcast back.
6645 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6646 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6647 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6648 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6649 return DAG.getBitcast(VT, Dup);
6650 }
6651
6652 // Find all zeroable elements.
6653 std::bitset<4> Zeroable, Undefs;
6654 for (int i = 0; i < 4; ++i) {
6655 SDValue Elt = Op.getOperand(i);
6656 Undefs[i] = Elt.isUndef();
6657 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6658 }
6659 assert(Zeroable.size() - Zeroable.count() > 1 &&
6660 "We expect at least two non-zero elements!");
6661
6662 // We only know how to deal with build_vector nodes where elements are either
6663 // zeroable or extract_vector_elt with constant index.
6664 SDValue FirstNonZero;
6665 unsigned FirstNonZeroIdx;
6666 for (unsigned i = 0; i < 4; ++i) {
6667 if (Zeroable[i])
6668 continue;
6669 SDValue Elt = Op.getOperand(i);
6670 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6671 !isa<ConstantSDNode>(Elt.getOperand(1)))
6672 return SDValue();
6673 // Make sure that this node is extracting from a 128-bit vector.
6674 MVT VT = Elt.getOperand(0).getSimpleValueType();
6675 if (!VT.is128BitVector())
6676 return SDValue();
6677 if (!FirstNonZero.getNode()) {
6678 FirstNonZero = Elt;
6679 FirstNonZeroIdx = i;
6680 }
6681 }
6682
6683 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6684 SDValue V1 = FirstNonZero.getOperand(0);
6685 MVT VT = V1.getSimpleValueType();
6686
6687 // See if this build_vector can be lowered as a blend with zero.
6688 SDValue Elt;
6689 unsigned EltMaskIdx, EltIdx;
6690 int Mask[4];
6691 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6692 if (Zeroable[EltIdx]) {
6693 // The zero vector will be on the right hand side.
6694 Mask[EltIdx] = EltIdx+4;
6695 continue;
6696 }
6697
6698 Elt = Op->getOperand(EltIdx);
6699 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6700 EltMaskIdx = Elt.getConstantOperandVal(1);
6701 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6702 break;
6703 Mask[EltIdx] = EltIdx;
6704 }
6705
6706 if (EltIdx == 4) {
6707 // Let the shuffle legalizer deal with blend operations.
6708 SDValue VZeroOrUndef = (Zeroable == Undefs)
6709 ? DAG.getUNDEF(VT)
6710 : getZeroVector(VT, Subtarget, DAG, DL);
6711 if (V1.getSimpleValueType() != VT)
6712 V1 = DAG.getBitcast(VT, V1);
6713 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6714 }
6715
6716 // See if we can lower this build_vector to a INSERTPS.
6717 if (!Subtarget.hasSSE41())
6718 return SDValue();
6719
6720 SDValue V2 = Elt.getOperand(0);
6721 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6722 V1 = SDValue();
6723
6724 bool CanFold = true;
6725 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6726 if (Zeroable[i])
6727 continue;
6728
6729 SDValue Current = Op->getOperand(i);
6730 SDValue SrcVector = Current->getOperand(0);
6731 if (!V1.getNode())
6732 V1 = SrcVector;
6733 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6734 }
6735
6736 if (!CanFold)
6737 return SDValue();
6738
6739 assert(V1.getNode() && "Expected at least two non-zero elements!");
6740 if (V1.getSimpleValueType() != MVT::v4f32)
6741 V1 = DAG.getBitcast(MVT::v4f32, V1);
6742 if (V2.getSimpleValueType() != MVT::v4f32)
6743 V2 = DAG.getBitcast(MVT::v4f32, V2);
6744
6745 // Ok, we can emit an INSERTPS instruction.
6746 unsigned ZMask = Zeroable.to_ulong();
6747
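// INSERTPS immediate layout: bits [7:6] select the source element (CountS), bits [5:4] select the destination element (CountD), and bits [3:0] form the zero mask.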
6748 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6749 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6750 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6751 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6752 return DAG.getBitcast(VT, Result);
6753}
6754
6755/// Return a vector logical shift node.
6756static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6757 SelectionDAG &DAG, const TargetLowering &TLI,
6758 const SDLoc &dl) {
6759 assert(VT.is128BitVector() && "Unknown type for VShift");
6760 MVT ShVT = MVT::v16i8;
6761 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6762 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6763 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6764 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6765 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6766}
6767
6768 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6769 SelectionDAG &DAG) {
6770
6771 // Check if the scalar load can be widened into a vector load. And if
6772 // the address is "base + cst" see if the cst can be "absorbed" into
6773 // the shuffle mask.
6774 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6775 SDValue Ptr = LD->getBasePtr();
6776 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6777 return SDValue();
6778 EVT PVT = LD->getValueType(0);
6779 if (PVT != MVT::i32 && PVT != MVT::f32)
6780 return SDValue();
6781
6782 int FI = -1;
6783 int64_t Offset = 0;
6784 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6785 FI = FINode->getIndex();
6786 Offset = 0;
6787 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6788 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6789 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6790 Offset = Ptr.getConstantOperandVal(1);
6791 Ptr = Ptr.getOperand(0);
6792 } else {
6793 return SDValue();
6794 }
6795
6796 // FIXME: 256-bit vector instructions don't require a strict alignment,
6797 // improve this code to support it better.
6798 Align RequiredAlign(VT.getSizeInBits() / 8);
6799 SDValue Chain = LD->getChain();
6800 // Make sure the stack object alignment is at least 16 or 32.
6801 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6802 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6803 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6804 if (MFI.isFixedObjectIndex(FI)) {
6805 // Can't change the alignment. FIXME: It's possible to compute
6806 // the exact stack offset and reference FI + adjust offset instead.
6807 // If someone *really* cares about this. That's the way to implement it.
6808 return SDValue();
6809 } else {
6810 MFI.setObjectAlignment(FI, RequiredAlign);
6811 }
6812 }
6813
6814 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6815 // Ptr + (Offset & ~15).
6816 if (Offset < 0)
6817 return SDValue();
6818 if ((Offset % RequiredAlign.value()) & 3)
6819 return SDValue();
6820 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6821 if (StartOffset) {
6822 SDLoc DL(Ptr);
6823 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6824 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6825 }
6826
6827 int EltNo = (Offset - StartOffset) >> 2;
6828 unsigned NumElems = VT.getVectorNumElements();
6829
6830 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6831 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6832 LD->getPointerInfo().getWithOffset(StartOffset));
6833
6834 SmallVector<int, 8> Mask(NumElems, EltNo);
6835
6836 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6837 }
6838
6839 return SDValue();
6840}
6841
6842 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6843static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6844 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6845 auto *BaseLd = cast<LoadSDNode>(Elt);
6846 if (!BaseLd->isSimple())
6847 return false;
6848 Ld = BaseLd;
6849 ByteOffset = 0;
6850 return true;
6851 }
6852
6853 switch (Elt.getOpcode()) {
6854 case ISD::BITCAST:
6855 case ISD::TRUNCATE:
6856 case ISD::SCALAR_TO_VECTOR:
6857 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6858 case ISD::SRL:
6859 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6860 uint64_t Amt = AmtC->getZExtValue();
6861 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6862 ByteOffset += Amt / 8;
6863 return true;
6864 }
6865 }
6866 break;
6867 case ISD::EXTRACT_VECTOR_ELT:
6868 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6869 SDValue Src = Elt.getOperand(0);
6870 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6871 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6872 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6873 findEltLoadSrc(Src, Ld, ByteOffset)) {
6874 uint64_t Idx = IdxC->getZExtValue();
6875 ByteOffset += Idx * (SrcSizeInBits / 8);
6876 return true;
6877 }
6878 }
6879 break;
6880 }
6881
6882 return false;
6883}
6884
6885/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6886/// elements can be replaced by a single large load which has the same value as
6887/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6888///
6889/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6890 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6891 const SDLoc &DL, SelectionDAG &DAG,
6892 const X86Subtarget &Subtarget,
6893 bool IsAfterLegalize) {
6894 if ((VT.getScalarSizeInBits() % 8) != 0)
6895 return SDValue();
6896
6897 unsigned NumElems = Elts.size();
6898
6899 int LastLoadedElt = -1;
6900 APInt LoadMask = APInt::getZero(NumElems);
6901 APInt ZeroMask = APInt::getZero(NumElems);
6902 APInt UndefMask = APInt::getZero(NumElems);
6903
6904 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6905 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6906
6907 // For each element in the initializer, see if we've found a load, zero or an
6908 // undef.
6909 for (unsigned i = 0; i < NumElems; ++i) {
6910 SDValue Elt = peekThroughBitcasts(Elts[i]);
6911 if (!Elt.getNode())
6912 return SDValue();
6913 if (Elt.isUndef()) {
6914 UndefMask.setBit(i);
6915 continue;
6916 }
6917 if (X86::isZeroNode(Elt)) {
6918 ZeroMask.setBit(i);
6919 continue;
6920 }
6921
6922 // Each loaded element must be the correct fractional portion of the
6923 // requested vector load.
6924 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6925 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6926 return SDValue();
6927
6928 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6929 return SDValue();
6930 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6931 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6932 return SDValue();
6933
6934 LoadMask.setBit(i);
6935 LastLoadedElt = i;
6936 }
6937 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6938 NumElems &&
6939 "Incomplete element masks");
6940
6941 // Handle Special Cases - all undef or undef/zero.
6942 if (UndefMask.popcount() == NumElems)
6943 return DAG.getUNDEF(VT);
6944 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6945 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6946 : DAG.getConstantFP(0.0, DL, VT);
6947
6948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6949 int FirstLoadedElt = LoadMask.countr_zero();
6950 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6951 EVT EltBaseVT = EltBase.getValueType();
6952 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6953 "Register/Memory size mismatch");
6954 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6955 assert(LDBase && "Did not find base load for merging consecutive loads");
6956 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6957 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6958 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6959 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6960 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6961
6962 // TODO: Support offsetting the base load.
6963 if (ByteOffsets[FirstLoadedElt] != 0)
6964 return SDValue();
6965
6966 // Check to see if the element's load is consecutive to the base load
6967 // or offset from a previous (already checked) load.
6968 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6969 LoadSDNode *Ld = Loads[EltIdx];
6970 int64_t ByteOffset = ByteOffsets[EltIdx];
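// If the element was extracted from within a load at a byte offset, accept it when it maps back onto an already-validated load at the matching base element.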
6971 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6972 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6973 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6974 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6975 }
6976 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6977 EltIdx - FirstLoadedElt);
6978 };
6979
6980 // Consecutive loads can contain UNDEFs but not ZERO elements.
6981 // Consecutive loads with UNDEF and ZERO elements require an
6982 // additional shuffle stage to clear the ZERO elements.
6983 bool IsConsecutiveLoad = true;
6984 bool IsConsecutiveLoadWithZeros = true;
6985 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6986 if (LoadMask[i]) {
6987 if (!CheckConsecutiveLoad(LDBase, i)) {
6988 IsConsecutiveLoad = false;
6989 IsConsecutiveLoadWithZeros = false;
6990 break;
6991 }
6992 } else if (ZeroMask[i]) {
6993 IsConsecutiveLoad = false;
6994 }
6995 }
6996
6997 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6998 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6999 assert(LDBase->isSimple() &&
7000 "Cannot merge volatile or atomic loads.");
7001 SDValue NewLd =
7002 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7003 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7004 MMOFlags);
7005 for (auto *LD : Loads)
7006 if (LD)
7007 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7008 return NewLd;
7009 };
7010
7011 // Check if the base load is entirely dereferenceable.
7012 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7013 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7014
7015 // LOAD - all consecutive load/undefs (must start/end with a load or be
7016 // entirely dereferenceable). If we have found an entire vector of loads and
7017 // undefs, then return a large load of the entire vector width starting at the
7018 // base pointer. If the vector contains zeros, then attempt to shuffle those
7019 // elements.
7020 if (FirstLoadedElt == 0 &&
7021 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7022 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7023 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7024 return SDValue();
7025
7026 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7027 // will lower to regular temporal loads and use the cache.
7028 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7029 VT.is256BitVector() && !Subtarget.hasInt256())
7030 return SDValue();
7031
7032 if (NumElems == 1)
7033 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7034
7035 if (!ZeroMask)
7036 return CreateLoad(VT, LDBase);
7037
7038 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7039 // vector and a zero vector to clear out the zero elements.
7040 if (!IsAfterLegalize && VT.isVector()) {
7041 unsigned NumMaskElts = VT.getVectorNumElements();
7042 if ((NumMaskElts % NumElems) == 0) {
7043 unsigned Scale = NumMaskElts / NumElems;
7044 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
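// Build a blend mask that keeps loaded lanes from the merged load and takes zeroed lanes from the zero vector.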
7045 for (unsigned i = 0; i < NumElems; ++i) {
7046 if (UndefMask[i])
7047 continue;
7048 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7049 for (unsigned j = 0; j != Scale; ++j)
7050 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7051 }
7052 SDValue V = CreateLoad(VT, LDBase);
7053 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7054 : DAG.getConstantFP(0.0, DL, VT);
7055 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7056 }
7057 }
7058 }
7059
7060 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7061 if (VT.is256BitVector() || VT.is512BitVector()) {
7062 unsigned HalfNumElems = NumElems / 2;
7063 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7064 EVT HalfVT =
7065 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7066 SDValue HalfLD =
7067 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7068 DAG, Subtarget, IsAfterLegalize);
7069 if (HalfLD)
7070 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7071 HalfLD, DAG.getIntPtrConstant(0, DL));
7072 }
7073 }
7074
7075 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7076 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7077 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7078 LoadSizeInBits == 64) &&
7079 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7080 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7081 : MVT::getIntegerVT(LoadSizeInBits);
7082 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7083 // Allow v4f32 on SSE1 only targets.
7084 // FIXME: Add more isel patterns so we can just use VT directly.
7085 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7086 VecVT = MVT::v4f32;
7087 if (TLI.isTypeLegal(VecVT)) {
7088 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7089 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7090 SDValue ResNode = DAG.getMemIntrinsicNode(
7091 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7092 LDBase->getOriginalAlign());
7093 for (auto *LD : Loads)
7094 if (LD)
7095 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7096 return DAG.getBitcast(VT, ResNode);
7097 }
7098 }
7099
7100 // BROADCAST - match the smallest possible repetition pattern, load that
7101 // scalar/subvector element and then broadcast to the entire vector.
7102 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7103 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7104 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7105 unsigned RepeatSize = SubElems * BaseSizeInBits;
7106 unsigned ScalarSize = std::min(RepeatSize, 64u);
7107 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7108 continue;
7109
7110 // Don't attempt a 1:N subvector broadcast - it should be caught by
7111 // combineConcatVectorOps, otherwise it will cause infinite loops.
7112 if (RepeatSize > ScalarSize && SubElems == 1)
7113 continue;
7114
7115 bool Match = true;
7116 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7117 for (unsigned i = 0; i != NumElems && Match; ++i) {
7118 if (!LoadMask[i])
7119 continue;
7120 SDValue Elt = peekThroughBitcasts(Elts[i]);
7121 if (RepeatedLoads[i % SubElems].isUndef())
7122 RepeatedLoads[i % SubElems] = Elt;
7123 else
7124 Match &= (RepeatedLoads[i % SubElems] == Elt);
7125 }
7126
7127 // We must have loads at both ends of the repetition.
7128 Match &= !RepeatedLoads.front().isUndef();
7129 Match &= !RepeatedLoads.back().isUndef();
7130 if (!Match)
7131 continue;
7132
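// Pick the repeat type: a scalar of at most 64 bits, or a small vector when the repeating pattern is wider, plus a broadcast type spanning the full vector width.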
7133 EVT RepeatVT =
7134 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7135 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7136 : EVT::getFloatingPointVT(ScalarSize);
7137 if (RepeatSize > ScalarSize)
7138 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7139 RepeatSize / ScalarSize);
7140 EVT BroadcastVT =
7141 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7142 VT.getSizeInBits() / ScalarSize);
7143 if (TLI.isTypeLegal(BroadcastVT)) {
7144 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7145 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7146 SDValue Broadcast = RepeatLoad;
7147 if (RepeatSize > ScalarSize) {
7148 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7149 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7150 } else {
7151 if (!Subtarget.hasAVX2() &&
7152 X86::mayFoldLoadIntoBroadcastFromMem(
7153 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7154 Subtarget,
7155 /*AssumeSingleUse=*/true))
7156 return SDValue();
7157 Broadcast =
7158 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7159 }
7160 return DAG.getBitcast(VT, Broadcast);
7161 }
7162 }
7163 }
7164 }
7165
7166 return SDValue();
7167}
7168
7169 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7170// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7171// are consecutive, non-overlapping, and in the right order.
7172 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7173 SelectionDAG &DAG,
7174 const X86Subtarget &Subtarget,
7175 bool IsAfterLegalize) {
7176 SmallVector<SDValue, 64> Elts;
7177 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7178 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7179 Elts.push_back(Elt);
7180 continue;
7181 }
7182 return SDValue();
7183 }
7184 assert(Elts.size() == VT.getVectorNumElements());
7185 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7186 IsAfterLegalize);
7187}
7188
7189 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7190 const APInt &Undefs, LLVMContext &C) {
7191 unsigned ScalarSize = VT.getScalarSizeInBits();
7192 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7193
7194 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7195 if (VT.isFloatingPoint()) {
7196 if (ScalarSize == 16)
7197 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7198 if (ScalarSize == 32)
7199 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7200 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7201 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7202 }
7203 return Constant::getIntegerValue(Ty, Val);
7204 };
7205
7206 SmallVector<Constant *, 32> ConstantVec;
7207 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7208 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7209 : getConstantScalar(Bits[I]));
7210
7211 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7212}
7213
7214static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7215 unsigned SplatBitSize, LLVMContext &C) {
7216 unsigned ScalarSize = VT.getScalarSizeInBits();
7217
7218 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7219 if (VT.isFloatingPoint()) {
7220 if (ScalarSize == 16)
7221 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7222 if (ScalarSize == 32)
7223 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7224 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7225 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7226 }
7227 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7228 };
7229
7230 if (ScalarSize == SplatBitSize)
7231 return getConstantScalar(SplatValue);
7232
7233 unsigned NumElm = SplatBitSize / ScalarSize;
7234 SmallVector<Constant *, 32> ConstantVec;
7235 for (unsigned I = 0; I != NumElm; ++I) {
7236 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7237 ConstantVec.push_back(getConstantScalar(Val));
7238 }
7239 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7240}
7241
7242 static bool isFoldableUseOfShuffle(SDNode *N) {
7243 for (auto *U : N->uses()) {
7244 unsigned Opc = U->getOpcode();
7245 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7246 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7247 return false;
7248 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7249 return false;
7250 if (isTargetShuffle(Opc))
7251 return true;
7252 if (Opc == ISD::BITCAST) // Ignore bitcasts
7253 return isFoldableUseOfShuffle(U);
7254 if (N->hasOneUse()) {
7255      // TODO: There may be some general way to know if an SDNode can
7256      // be folded. For now we only know whether an MI is foldable.
7257 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7258 return false;
7259 return true;
7260 }
7261 }
7262 return false;
7263}
7264
7265/// Attempt to use the vbroadcast instruction to generate a splat value
7266/// from a splat BUILD_VECTOR which uses:
7267/// a. A single scalar load, or a constant.
7268/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7269///
7270/// The VBROADCAST node is returned when a pattern is found,
7271/// or SDValue() otherwise.
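///
/// For example (illustrative): (v4f32 build_vector (load float, ptr %p),
/// (load %p), (load %p), (load %p)) is a splat of a single scalar load and can
/// typically be lowered to a broadcast from memory (e.g. vbroadcastss (%p)).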
7272static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7273                                           const SDLoc &dl,
7274 const X86Subtarget &Subtarget,
7275 SelectionDAG &DAG) {
7276 // VBROADCAST requires AVX.
7277 // TODO: Splats could be generated for non-AVX CPUs using SSE
7278 // instructions, but there's less potential gain for only 128-bit vectors.
7279 if (!Subtarget.hasAVX())
7280 return SDValue();
7281
7282 MVT VT = BVOp->getSimpleValueType(0);
7283 unsigned NumElts = VT.getVectorNumElements();
7284 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7285 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7286 "Unsupported vector type for broadcast.");
7287
7288 // See if the build vector is a repeating sequence of scalars (inc. splat).
7289 SDValue Ld;
7290 BitVector UndefElements;
7291 SmallVector<SDValue, 16> Sequence;
7292 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7293 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7294 if (Sequence.size() == 1)
7295 Ld = Sequence[0];
7296 }
7297
7298 // Attempt to use VBROADCASTM
7299 // From this pattern:
7300 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7301 // b. t1 = (build_vector t0 t0)
7302 //
7303 // Create (VBROADCASTM v2i1 X)
7304 if (!Sequence.empty() && Subtarget.hasCDI()) {
7305 // If not a splat, are the upper sequence values zeroable?
7306 unsigned SeqLen = Sequence.size();
7307 bool UpperZeroOrUndef =
7308 SeqLen == 1 ||
7309 llvm::all_of(ArrayRef(Sequence).drop_front(),
7310 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7311 SDValue Op0 = Sequence[0];
7312 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7313 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7314 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7315 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7316 ? Op0.getOperand(0)
7317 : Op0.getOperand(0).getOperand(0);
7318 MVT MaskVT = BOperand.getSimpleValueType();
7319 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7320 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7321 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7322 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7323 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7324 unsigned Scale = 512 / VT.getSizeInBits();
7325 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7326 }
7327 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7328 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7329 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7330 return DAG.getBitcast(VT, Bcst);
7331 }
7332 }
7333 }
7334
7335 unsigned NumUndefElts = UndefElements.count();
7336 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7337 APInt SplatValue, Undef;
7338 unsigned SplatBitSize;
7339 bool HasUndef;
7340 // Check if this is a repeated constant pattern suitable for broadcasting.
7341 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7342 SplatBitSize > VT.getScalarSizeInBits() &&
7343 SplatBitSize < VT.getSizeInBits()) {
7344 // Avoid replacing with broadcast when it's a use of a shuffle
7345 // instruction to preserve the present custom lowering of shuffles.
7346 if (isFoldableUseOfShuffle(BVOp))
7347 return SDValue();
7348      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7349 LLVMContext *Ctx = DAG.getContext();
7350 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7351 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7352 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7353 // Load the constant scalar/subvector and broadcast it.
7354 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7355 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7356 SDValue CP = DAG.getConstantPool(C, PVT);
7357 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7358
7359 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7360 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7361 SDValue Ops[] = {DAG.getEntryNode(), CP};
7362 MachinePointerInfo MPI =
7363            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7364        SDValue Brdcst =
7365 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7366 MPI, Alignment, MachineMemOperand::MOLoad);
7367 return DAG.getBitcast(VT, Brdcst);
7368 }
7369 if (SplatBitSize > 64) {
7370 // Load the vector of constants and broadcast it.
7371 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7372 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7373 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7374 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7375 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7376 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7377 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7378 MachinePointerInfo MPI =
7379            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7380        return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7381                                       Ops, VVT, MPI, Alignment,
7382                                       MachineMemOperand::MOLoad);
7383      }
7384 }
7385
7386 // If we are moving a scalar into a vector (Ld must be set and all elements
7387 // but 1 are undef) and that operation is not obviously supported by
7388 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7389 // That's better than general shuffling and may eliminate a load to GPR and
7390 // move from scalar to vector register.
7391 if (!Ld || NumElts - NumUndefElts != 1)
7392 return SDValue();
7393 unsigned ScalarSize = Ld.getValueSizeInBits();
7394 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7395 return SDValue();
7396 }
7397
7398 bool ConstSplatVal =
7399 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7400 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7401
7402 // TODO: Handle broadcasts of non-constant sequences.
7403
7404 // Make sure that all of the users of a non-constant load are from the
7405 // BUILD_VECTOR node.
7406 // FIXME: Is the use count needed for non-constant, non-load case?
7407 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7408 return SDValue();
7409
7410 unsigned ScalarSize = Ld.getValueSizeInBits();
7411 bool IsGE256 = (VT.getSizeInBits() >= 256);
7412
7413 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7414 // instruction to save 8 or more bytes of constant pool data.
7415 // TODO: If multiple splats are generated to load the same constant,
7416 // it may be detrimental to overall size. There needs to be a way to detect
7417 // that condition to know if this is truly a size win.
7418 bool OptForSize = DAG.shouldOptForSize();
7419
7420 // Handle broadcasting a single constant scalar from the constant pool
7421 // into a vector.
7422 // On Sandybridge (no AVX2), it is still better to load a constant vector
7423 // from the constant pool and not to broadcast it from a scalar.
7424 // But override that restriction when optimizing for size.
7425 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7426 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7427 EVT CVT = Ld.getValueType();
7428 assert(!CVT.isVector() && "Must not broadcast a vector type");
7429
7430 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7431 // For size optimization, also splat v2f64 and v2i64, and for size opt
7432 // with AVX2, also splat i8 and i16.
7433 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7434 if (ScalarSize == 32 ||
7435 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7436 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7437 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7438 const Constant *C = nullptr;
7439 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7440 C = CI->getConstantIntValue();
7441 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7442 C = CF->getConstantFPValue();
7443
7444 assert(C && "Invalid constant type");
7445
7446 SDValue CP =
7447          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7448      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7449
7450 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7451 SDValue Ops[] = {DAG.getEntryNode(), CP};
7452 MachinePointerInfo MPI =
7453          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7454      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7455 MPI, Alignment, MachineMemOperand::MOLoad);
7456 }
7457 }
7458
7459 // Handle AVX2 in-register broadcasts.
7460 if (!IsLoad && Subtarget.hasInt256() &&
7461 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7462 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7463
7464 // The scalar source must be a normal load.
7465 if (!IsLoad)
7466 return SDValue();
7467
7468 // Make sure the non-chain result is only used by this build vector.
7469 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7470 return SDValue();
7471
7472 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7473 (Subtarget.hasVLX() && ScalarSize == 64)) {
7474 auto *LN = cast<LoadSDNode>(Ld);
7475 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7476 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7477 SDValue BCast =
7478        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7479                                LN->getMemoryVT(), LN->getMemOperand());
7480 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7481 return BCast;
7482 }
7483
7484  // The integer check is needed for the 64-bit element into 128-bit vector
7485  // case, so it doesn't match double, since there is no vbroadcastsd xmm.
7486 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7487 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7488 auto *LN = cast<LoadSDNode>(Ld);
7489 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7490 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7491 SDValue BCast =
7492        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7493                                LN->getMemoryVT(), LN->getMemOperand());
7494 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7495 return BCast;
7496 }
7497
7498 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7499 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7500
7501 // Unsupported broadcast.
7502 return SDValue();
7503}
7504
7505/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7506/// underlying vector and index.
7507///
7508/// Modifies \p ExtractedFromVec to the real vector and returns the real
7509/// index.
7510static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7511 SDValue ExtIdx) {
7512 int Idx = ExtIdx->getAsZExtVal();
7513 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7514 return Idx;
7515
7516 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7517 // lowered this:
7518 // (extract_vector_elt (v8f32 %1), Constant<6>)
7519 // to:
7520 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7521 // (extract_subvector (v8f32 %0), Constant<4>),
7522 // undef)
7523 // Constant<0>)
7524 // In this case the vector is the extract_subvector expression and the index
7525 // is 2, as specified by the shuffle.
7526 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7527 SDValue ShuffleVec = SVOp->getOperand(0);
7528 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7529 assert(ShuffleVecVT.getVectorElementType() ==
7530 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7531
7532 int ShuffleIdx = SVOp->getMaskElt(Idx);
7533 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7534 ExtractedFromVec = ShuffleVec;
7535 return ShuffleIdx;
7536 }
7537 return Idx;
7538}
7539
7540static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7541                                      SelectionDAG &DAG) {
7542 MVT VT = Op.getSimpleValueType();
7543
7544 // Skip if insert_vec_elt is not supported.
7545 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7546  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7547    return SDValue();
7548
7549 unsigned NumElems = Op.getNumOperands();
7550 SDValue VecIn1;
7551 SDValue VecIn2;
7552 SmallVector<unsigned, 4> InsertIndices;
7553 SmallVector<int, 8> Mask(NumElems, -1);
7554
7555 for (unsigned i = 0; i != NumElems; ++i) {
7556 unsigned Opc = Op.getOperand(i).getOpcode();
7557
7558 if (Opc == ISD::UNDEF)
7559 continue;
7560
7561 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7562      // Quit if more than 1 element needs inserting.
7563 if (InsertIndices.size() > 1)
7564 return SDValue();
7565
7566 InsertIndices.push_back(i);
7567 continue;
7568 }
7569
7570 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7571 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7572
7573 // Quit if non-constant index.
7574 if (!isa<ConstantSDNode>(ExtIdx))
7575 return SDValue();
7576 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7577
7578 // Quit if extracted from vector of different type.
7579 if (ExtractedFromVec.getValueType() != VT)
7580 return SDValue();
7581
7582 if (!VecIn1.getNode())
7583 VecIn1 = ExtractedFromVec;
7584 else if (VecIn1 != ExtractedFromVec) {
7585 if (!VecIn2.getNode())
7586 VecIn2 = ExtractedFromVec;
7587 else if (VecIn2 != ExtractedFromVec)
7588 // Quit if more than 2 vectors to shuffle
7589 return SDValue();
7590 }
7591
7592 if (ExtractedFromVec == VecIn1)
7593 Mask[i] = Idx;
7594 else if (ExtractedFromVec == VecIn2)
7595 Mask[i] = Idx + NumElems;
7596 }
7597
7598 if (!VecIn1.getNode())
7599 return SDValue();
7600
7601 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7602 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7603
7604 for (unsigned Idx : InsertIndices)
7605 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7606 DAG.getIntPtrConstant(Idx, DL));
7607
7608 return NV;
7609}
7610
7611// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7612static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7613                                       const X86Subtarget &Subtarget) {
7614 MVT VT = Op.getSimpleValueType();
7615 MVT IVT =
7616 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7617  SmallVector<SDValue, 16> NewOps;
7618  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7619 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7620 Op.getOperand(I)));
7621 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7622 return DAG.getBitcast(VT, Res);
7623}
7624
7625// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
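// For illustration: (v8i1 build_vector 1,0,1,1,0,0,0,0) collects its constant
// bits into Immediate = 0b00001101, which can then be materialized as a
// bitcast of that scalar immediate to the mask type.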
7626static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7627                                     SelectionDAG &DAG,
7628 const X86Subtarget &Subtarget) {
7629
7630 MVT VT = Op.getSimpleValueType();
7631 assert((VT.getVectorElementType() == MVT::i1) &&
7632 "Unexpected type in LowerBUILD_VECTORvXi1!");
7633 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7634 ISD::isBuildVectorAllOnes(Op.getNode()))
7635 return Op;
7636
7637 uint64_t Immediate = 0;
7638 SmallVector<unsigned, 16> NonConstIdx;
7639 bool IsSplat = true;
7640 bool HasConstElts = false;
7641 int SplatIdx = -1;
7642 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7643 SDValue In = Op.getOperand(idx);
7644 if (In.isUndef())
7645 continue;
7646 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7647 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7648 HasConstElts = true;
7649 } else {
7650 NonConstIdx.push_back(idx);
7651 }
7652 if (SplatIdx < 0)
7653 SplatIdx = idx;
7654 else if (In != Op.getOperand(SplatIdx))
7655 IsSplat = false;
7656 }
7657
7658  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7659 if (IsSplat) {
7660 // The build_vector allows the scalar element to be larger than the vector
7661 // element type. We need to mask it to use as a condition unless we know
7662 // the upper bits are zero.
7663 // FIXME: Use computeKnownBits instead of checking specific opcode?
7664 SDValue Cond = Op.getOperand(SplatIdx);
7665 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7666 if (Cond.getOpcode() != ISD::SETCC)
7667 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7668 DAG.getConstant(1, dl, MVT::i8));
7669
7670 // Perform the select in the scalar domain so we can use cmov.
7671 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7672 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7673 DAG.getAllOnesConstant(dl, MVT::i32),
7674 DAG.getConstant(0, dl, MVT::i32));
7675 Select = DAG.getBitcast(MVT::v32i1, Select);
7676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7677 } else {
7678 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7679 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7680 DAG.getAllOnesConstant(dl, ImmVT),
7681 DAG.getConstant(0, dl, ImmVT));
7682 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7683 Select = DAG.getBitcast(VecVT, Select);
7684 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7685 DAG.getIntPtrConstant(0, dl));
7686 }
7687 }
7688
7689  // Insert the non-constant elements one by one.
7690 SDValue DstVec;
7691 if (HasConstElts) {
7692 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7693 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7694 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7695 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7696 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7697 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7698 } else {
7699 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7700 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7701 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7702 DstVec = DAG.getBitcast(VecVT, Imm);
7703 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7704 DAG.getIntPtrConstant(0, dl));
7705 }
7706 } else
7707 DstVec = DAG.getUNDEF(VT);
7708
7709 for (unsigned InsertIdx : NonConstIdx) {
7710 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7711 Op.getOperand(InsertIdx),
7712 DAG.getIntPtrConstant(InsertIdx, dl));
7713 }
7714 return DstVec;
7715}
7716
7717LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7718 switch (Opcode) {
7719 case X86ISD::PACKSS:
7720 case X86ISD::PACKUS:
7721 case X86ISD::FHADD:
7722 case X86ISD::FHSUB:
7723 case X86ISD::HADD:
7724 case X86ISD::HSUB:
7725 return true;
7726 }
7727 return false;
7728}
7729
7730/// This is a helper function of LowerToHorizontalOp().
7731/// This function checks that the build_vector \p N in input implements a
7732/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7733/// may not match the layout of an x86 256-bit horizontal instruction.
7734/// In other words, if this returns true, then some extraction/insertion will
7735/// be required to produce a valid horizontal instruction.
7736///
7737/// Parameter \p Opcode defines the kind of horizontal operation to match.
7738/// For example, if \p Opcode is equal to ISD::ADD, then this function
7739/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7740/// is equal to ISD::SUB, then this function checks if this is a horizontal
7741/// arithmetic sub.
7742///
7743/// This function only analyzes elements of \p N whose indices are
7744/// in range [BaseIdx, LastIdx).
7745///
7746/// TODO: This function was originally used to match both real and fake partial
7747/// horizontal operations, but the index-matching logic is incorrect for that.
7748/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7749/// code because it is only used for partial h-op matching now?
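///
/// For example (illustrative, with \p BaseIdx == 0 and \p Opcode == ISD::ADD):
/// a build_vector whose first elements are
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// matches a partial horizontal add, with \p V0 set to A.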
7750static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7751 const SDLoc &DL, SelectionDAG &DAG,
7752 unsigned BaseIdx, unsigned LastIdx,
7753 SDValue &V0, SDValue &V1) {
7754 EVT VT = N->getValueType(0);
7755 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7756 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7757 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7758 "Invalid Vector in input!");
7759
7760 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7761 bool CanFold = true;
7762 unsigned ExpectedVExtractIdx = BaseIdx;
7763 unsigned NumElts = LastIdx - BaseIdx;
7764 V0 = DAG.getUNDEF(VT);
7765 V1 = DAG.getUNDEF(VT);
7766
7767 // Check if N implements a horizontal binop.
7768 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7769 SDValue Op = N->getOperand(i + BaseIdx);
7770
7771 // Skip UNDEFs.
7772 if (Op->isUndef()) {
7773 // Update the expected vector extract index.
7774 if (i * 2 == NumElts)
7775 ExpectedVExtractIdx = BaseIdx;
7776 ExpectedVExtractIdx += 2;
7777 continue;
7778 }
7779
7780 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7781
7782 if (!CanFold)
7783 break;
7784
7785 SDValue Op0 = Op.getOperand(0);
7786 SDValue Op1 = Op.getOperand(1);
7787
7788 // Try to match the following pattern:
7789 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7790 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7791               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7792               Op0.getOperand(0) == Op1.getOperand(0) &&
7793 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7794 isa<ConstantSDNode>(Op1.getOperand(1)));
7795 if (!CanFold)
7796 break;
7797
7798 unsigned I0 = Op0.getConstantOperandVal(1);
7799 unsigned I1 = Op1.getConstantOperandVal(1);
7800
7801 if (i * 2 < NumElts) {
7802 if (V0.isUndef()) {
7803 V0 = Op0.getOperand(0);
7804 if (V0.getValueType() != VT)
7805 return false;
7806 }
7807 } else {
7808 if (V1.isUndef()) {
7809 V1 = Op0.getOperand(0);
7810 if (V1.getValueType() != VT)
7811 return false;
7812 }
7813 if (i * 2 == NumElts)
7814 ExpectedVExtractIdx = BaseIdx;
7815 }
7816
7817 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7818 if (I0 == ExpectedVExtractIdx)
7819 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7820 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7821 // Try to match the following dag sequence:
7822 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7823 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7824 } else
7825 CanFold = false;
7826
7827 ExpectedVExtractIdx += 2;
7828 }
7829
7830 return CanFold;
7831}
7832
7833/// Emit a sequence of two 128-bit horizontal add/sub followed by
7834/// a concat_vector.
7835///
7836/// This is a helper function of LowerToHorizontalOp().
7837/// This function expects two 256-bit vectors called V0 and V1.
7838/// At first, each vector is split into two separate 128-bit vectors.
7839/// Then, the resulting 128-bit vectors are used to implement two
7840/// horizontal binary operations.
7841///
7842/// The kind of horizontal binary operation is defined by \p X86Opcode.
7843///
7844/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7845/// the two new horizontal binops.
7846/// When Mode is set, the first horizontal binop dag node would take as input
7847/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7848/// horizontal binop dag node would take as input the lower 128-bit of V1
7849/// and the upper 128-bit of V1.
7850/// Example:
7851/// HADD V0_LO, V0_HI
7852/// HADD V1_LO, V1_HI
7853///
7854/// Otherwise, the first horizontal binop dag node takes as input the lower
7855/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7856/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7857/// Example:
7858/// HADD V0_LO, V1_LO
7859/// HADD V0_HI, V1_HI
7860///
7861/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7862/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7863/// the upper 128-bits of the result.
7864static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7865 const SDLoc &DL, SelectionDAG &DAG,
7866 unsigned X86Opcode, bool Mode,
7867 bool isUndefLO, bool isUndefHI) {
7868 MVT VT = V0.getSimpleValueType();
7869 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7870 "Invalid nodes in input!");
7871
7872 unsigned NumElts = VT.getVectorNumElements();
7873 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7874 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7875 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7876 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7877 MVT NewVT = V0_LO.getSimpleValueType();
7878
7879 SDValue LO = DAG.getUNDEF(NewVT);
7880 SDValue HI = DAG.getUNDEF(NewVT);
7881
7882 if (Mode) {
7883 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7884 if (!isUndefLO && !V0->isUndef())
7885 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7886 if (!isUndefHI && !V1->isUndef())
7887 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7888 } else {
7889 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7890 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7891 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7892
7893 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7894 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7895 }
7896
7897 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7898}
7899
7900/// Returns true iff \p BV builds a vector with the result equivalent to
7901/// the result of ADDSUB/SUBADD operation.
7902/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7903/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7904/// \p Opnd0 and \p Opnd1.
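///
/// For example (illustrative): the v4f32 build_vector
///   (fsub (extractelt A, 0), (extractelt B, 0)),
///   (fadd (extractelt A, 1), (extractelt B, 1)),
///   (fsub (extractelt A, 2), (extractelt B, 2)),
///   (fadd (extractelt A, 3), (extractelt B, 3))
/// matches ADDSUB with Opnd0 = A and Opnd1 = B (IsSubAdd == false).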
7905static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7906                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
7907 SDValue &Opnd0, SDValue &Opnd1,
7908 unsigned &NumExtracts,
7909 bool &IsSubAdd) {
7910
7911 MVT VT = BV->getSimpleValueType(0);
7912 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7913 return false;
7914
7915 unsigned NumElts = VT.getVectorNumElements();
7916 SDValue InVec0 = DAG.getUNDEF(VT);
7917 SDValue InVec1 = DAG.getUNDEF(VT);
7918
7919 NumExtracts = 0;
7920
7921 // Odd-numbered elements in the input build vector are obtained from
7922 // adding/subtracting two integer/float elements.
7923 // Even-numbered elements in the input build vector are obtained from
7924 // subtracting/adding two integer/float elements.
7925 unsigned Opc[2] = {0, 0};
7926 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7927 SDValue Op = BV->getOperand(i);
7928
7929 // Skip 'undef' values.
7930 unsigned Opcode = Op.getOpcode();
7931 if (Opcode == ISD::UNDEF)
7932 continue;
7933
7934 // Early exit if we found an unexpected opcode.
7935 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7936 return false;
7937
7938 SDValue Op0 = Op.getOperand(0);
7939 SDValue Op1 = Op.getOperand(1);
7940
7941 // Try to match the following pattern:
7942 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7943 // Early exit if we cannot match that sequence.
7944 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7945        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7946        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7947 Op0.getOperand(1) != Op1.getOperand(1))
7948 return false;
7949
7950 unsigned I0 = Op0.getConstantOperandVal(1);
7951 if (I0 != i)
7952 return false;
7953
7954    // We found a valid add/sub node; make sure it's the same opcode as previous
7955    // elements for this parity.
7956 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7957 return false;
7958 Opc[i % 2] = Opcode;
7959
7960 // Update InVec0 and InVec1.
7961 if (InVec0.isUndef()) {
7962 InVec0 = Op0.getOperand(0);
7963 if (InVec0.getSimpleValueType() != VT)
7964 return false;
7965 }
7966 if (InVec1.isUndef()) {
7967 InVec1 = Op1.getOperand(0);
7968 if (InVec1.getSimpleValueType() != VT)
7969 return false;
7970 }
7971
7972 // Make sure that operands in input to each add/sub node always
7973 // come from a same pair of vectors.
7974 if (InVec0 != Op0.getOperand(0)) {
7975 if (Opcode == ISD::FSUB)
7976 return false;
7977
7978 // FADD is commutable. Try to commute the operands
7979 // and then test again.
7980 std::swap(Op0, Op1);
7981 if (InVec0 != Op0.getOperand(0))
7982 return false;
7983 }
7984
7985 if (InVec1 != Op1.getOperand(0))
7986 return false;
7987
7988 // Increment the number of extractions done.
7989 ++NumExtracts;
7990 }
7991
7992 // Ensure we have found an opcode for both parities and that they are
7993 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7994 // inputs are undef.
7995 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7996 InVec0.isUndef() || InVec1.isUndef())
7997 return false;
7998
7999 IsSubAdd = Opc[0] == ISD::FADD;
8000
8001 Opnd0 = InVec0;
8002 Opnd1 = InVec1;
8003 return true;
8004}
8005
8006/// Returns true if it is possible to fold MUL and an idiom that has already been
8007/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8008/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8009/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8010///
8011/// Prior to calling this function it should be known that there is some
8012/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8013/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8014/// before replacement of such SDNode with ADDSUB operation. Thus the number
8015/// of \p Opnd0 uses is expected to be equal to 2.
8016/// For example, this function may be called for the following IR:
8017/// %AB = fmul fast <2 x double> %A, %B
8018/// %Sub = fsub fast <2 x double> %AB, %C
8019/// %Add = fadd fast <2 x double> %AB, %C
8020/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8021/// <2 x i32> <i32 0, i32 3>
8022/// There is a def for %Addsub here, which potentially can be replaced by
8023/// X86ISD::ADDSUB operation:
8024/// %Addsub = X86ISD::ADDSUB %AB, %C
8025/// and such ADDSUB can further be replaced with FMADDSUB:
8026/// %Addsub = FMADDSUB %A, %B, %C.
8027///
8028/// The main reason why this method is called before the replacement of the
8029/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8030/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8031/// FMADDSUB is.
8032static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8033 SelectionDAG &DAG,
8034 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8035 unsigned ExpectedUses) {
8036 if (Opnd0.getOpcode() != ISD::FMUL ||
8037 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8038 return false;
8039
8040 // FIXME: These checks must match the similar ones in
8041 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8042 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8043 // or MUL + ADDSUB to FMADDSUB.
8044 const TargetOptions &Options = DAG.getTarget().Options;
8045 bool AllowFusion =
8046 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8047 if (!AllowFusion)
8048 return false;
8049
8050 Opnd2 = Opnd1;
8051 Opnd1 = Opnd0.getOperand(1);
8052 Opnd0 = Opnd0.getOperand(0);
8053
8054 return true;
8055}
8056
8057/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8058/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB or
8059/// X86ISD::FMSUBADD node.
8060static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8061                                       const SDLoc &DL,
8062 const X86Subtarget &Subtarget,
8063 SelectionDAG &DAG) {
8064 SDValue Opnd0, Opnd1;
8065 unsigned NumExtracts;
8066 bool IsSubAdd;
8067 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8068 IsSubAdd))
8069 return SDValue();
8070
8071 MVT VT = BV->getSimpleValueType(0);
8072
8073 // Try to generate X86ISD::FMADDSUB node here.
8074 SDValue Opnd2;
8075 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8076 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8077 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8078 }
8079
8080 // We only support ADDSUB.
8081 if (IsSubAdd)
8082 return SDValue();
8083
8084 // There are no known X86 targets with 512-bit ADDSUB instructions!
8085 // Convert to blend(fsub,fadd).
8086 if (VT.is512BitVector()) {
8087 SmallVector<int> Mask;
8088 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8089 Mask.push_back(I);
8090 Mask.push_back(I + E + 1);
8091 }
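    // For illustration: for v8f64 the mask built above is
    // <0, 9, 2, 11, 4, 13, 6, 15>, taking even lanes from the FSUB result and
    // odd lanes from the FADD result.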
8092 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8093 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8094 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8095 }
8096
8097 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8098}
8099
8100static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8101                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8102 // Initialize outputs to known values.
8103 MVT VT = BV->getSimpleValueType(0);
8104 HOpcode = ISD::DELETED_NODE;
8105 V0 = DAG.getUNDEF(VT);
8106 V1 = DAG.getUNDEF(VT);
8107
8108 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8109 // half of the result is calculated independently from the 128-bit halves of
8110 // the inputs, so that makes the index-checking logic below more complicated.
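  // For example, a v8i32 HADD produces lanes 0-1 from the low 128 bits of V0,
  // lanes 2-3 from the low 128 bits of V1, lanes 4-5 from the high 128 bits of
  // V0, and lanes 6-7 from the high 128 bits of V1.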
8111 unsigned NumElts = VT.getVectorNumElements();
8112 unsigned GenericOpcode = ISD::DELETED_NODE;
8113 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8114 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8115 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8116 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8117 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8118 // Ignore undef elements.
8119 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8120 if (Op.isUndef())
8121 continue;
8122
8123 // If there's an opcode mismatch, we're done.
8124 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8125 return false;
8126
8127 // Initialize horizontal opcode.
8128 if (HOpcode == ISD::DELETED_NODE) {
8129 GenericOpcode = Op.getOpcode();
8130 switch (GenericOpcode) {
8131 // clang-format off
8132 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8133 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8134 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8135 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8136 default: return false;
8137 // clang-format on
8138 }
8139 }
8140
8141 SDValue Op0 = Op.getOperand(0);
8142 SDValue Op1 = Op.getOperand(1);
8143 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8144          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8145          Op0.getOperand(0) != Op1.getOperand(0) ||
8146 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8147 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8148 return false;
8149
8150 // The source vector is chosen based on which 64-bit half of the
8151 // destination vector is being calculated.
8152 if (j < NumEltsIn64Bits) {
8153 if (V0.isUndef())
8154 V0 = Op0.getOperand(0);
8155 } else {
8156 if (V1.isUndef())
8157 V1 = Op0.getOperand(0);
8158 }
8159
8160 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8161 if (SourceVec != Op0.getOperand(0))
8162 return false;
8163
8164 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8165 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8166 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8167 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8168 (j % NumEltsIn64Bits) * 2;
8169 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8170 continue;
8171
8172 // If this is not a commutative op, this does not match.
8173 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8174 return false;
8175
8176 // Addition is commutative, so try swapping the extract indexes.
8177 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8178 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8179 continue;
8180
8181 // Extract indexes do not match horizontal requirement.
8182 return false;
8183 }
8184 }
8185 // We matched. Opcode and operands are returned by reference as arguments.
8186 return true;
8187}
8188
8189static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8190                                    const SDLoc &DL, SelectionDAG &DAG,
8191 unsigned HOpcode, SDValue V0, SDValue V1) {
8192 // If either input vector is not the same size as the build vector,
8193 // extract/insert the low bits to the correct size.
8194 // This is free (examples: zmm --> xmm, xmm --> ymm).
8195 MVT VT = BV->getSimpleValueType(0);
8196 unsigned Width = VT.getSizeInBits();
8197 if (V0.getValueSizeInBits() > Width)
8198 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8199 else if (V0.getValueSizeInBits() < Width)
8200 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8201
8202 if (V1.getValueSizeInBits() > Width)
8203 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8204 else if (V1.getValueSizeInBits() < Width)
8205 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8206
8207 unsigned NumElts = VT.getVectorNumElements();
8208 APInt DemandedElts = APInt::getAllOnes(NumElts);
8209 for (unsigned i = 0; i != NumElts; ++i)
8210 if (BV->getOperand(i).isUndef())
8211 DemandedElts.clearBit(i);
8212
8213 // If we don't need the upper xmm, then perform as a xmm hop.
8214 unsigned HalfNumElts = NumElts / 2;
8215 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8216 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8217 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8218 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8219 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8220 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8221 }
8222
8223 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8224}
8225
8226/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
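/// For example (illustrative): (v4i32 build_vector (add a0, a1), (add a2, a3),
/// (add b0, b1), (add b2, b3)), where aN/bN are extracts of element N of
/// vectors A and B, can be lowered to (X86ISD::HADD A, B).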
8227static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8228                                   const X86Subtarget &Subtarget,
8229 SelectionDAG &DAG) {
8230 // We need at least 2 non-undef elements to make this worthwhile by default.
8231 unsigned NumNonUndefs =
8232 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8233 if (NumNonUndefs < 2)
8234 return SDValue();
8235
8236 // There are 4 sets of horizontal math operations distinguished by type:
8237 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8238 // subtarget feature. Try to match those "native" patterns first.
8239 MVT VT = BV->getSimpleValueType(0);
8240 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8241 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8242 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8243 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8244 unsigned HOpcode;
8245 SDValue V0, V1;
8246 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8247 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8248 }
8249
8250 // Try harder to match 256-bit ops by using extract/concat.
8251 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8252 return SDValue();
8253
8254 // Count the number of UNDEF operands in the build_vector in input.
8255 unsigned NumElts = VT.getVectorNumElements();
8256 unsigned Half = NumElts / 2;
8257 unsigned NumUndefsLO = 0;
8258 unsigned NumUndefsHI = 0;
8259 for (unsigned i = 0, e = Half; i != e; ++i)
8260 if (BV->getOperand(i)->isUndef())
8261 NumUndefsLO++;
8262
8263 for (unsigned i = Half, e = NumElts; i != e; ++i)
8264 if (BV->getOperand(i)->isUndef())
8265 NumUndefsHI++;
8266
8267 SDValue InVec0, InVec1;
8268 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8269 SDValue InVec2, InVec3;
8270 unsigned X86Opcode;
8271 bool CanFold = true;
8272
8273 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8274 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8275 InVec3) &&
8276 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8277 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8278 X86Opcode = X86ISD::HADD;
8279 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8280 InVec1) &&
8281 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8282 InVec3) &&
8283 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8284 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8285 X86Opcode = X86ISD::HSUB;
8286 else
8287 CanFold = false;
8288
8289 if (CanFold) {
8290 // Do not try to expand this build_vector into a pair of horizontal
8291 // add/sub if we can emit a pair of scalar add/sub.
8292 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8293 return SDValue();
8294
8295 // Convert this build_vector into a pair of horizontal binops followed by
8296 // a concat vector. We must adjust the outputs from the partial horizontal
8297 // matching calls above to account for undefined vector halves.
8298 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8299 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8300 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8301 bool isUndefLO = NumUndefsLO == Half;
8302 bool isUndefHI = NumUndefsHI == Half;
8303 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8304 isUndefHI);
8305 }
8306 }
8307
8308 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8309 VT == MVT::v16i16) {
8310 unsigned X86Opcode;
8311 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8312 InVec1))
8313 X86Opcode = X86ISD::HADD;
8314 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8315 InVec1))
8316 X86Opcode = X86ISD::HSUB;
8317 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8318 InVec1))
8319 X86Opcode = X86ISD::FHADD;
8320 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8321 InVec1))
8322 X86Opcode = X86ISD::FHSUB;
8323 else
8324 return SDValue();
8325
8326 // Don't try to expand this build_vector into a pair of horizontal add/sub
8327 // if we can simply emit a pair of scalar add/sub.
8328 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8329 return SDValue();
8330
8331 // Convert this build_vector into two horizontal add/sub followed by
8332 // a concat vector.
8333 bool isUndefLO = NumUndefsLO == Half;
8334 bool isUndefHI = NumUndefsHI == Half;
8335 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8336 isUndefLO, isUndefHI);
8337 }
8338
8339 return SDValue();
8340}
8341
8342static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8343 SelectionDAG &DAG);
8344
8345/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8346/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8347/// just apply the bit to the vectors.
8348/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8349/// from this, but enough scalar bit operations are created from the later
8350/// legalization + scalarization stages to need basic support.
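///
/// For example (illustrative): (build_vector (and x0, 1), (and x1, 2),
/// (and x2, 4), (and x3, 8)) becomes
/// (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)).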
8351static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8352                                       const X86Subtarget &Subtarget,
8353 SelectionDAG &DAG) {
8354 MVT VT = Op->getSimpleValueType(0);
8355 unsigned NumElems = VT.getVectorNumElements();
8356 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8357
8358 // Check that all elements have the same opcode.
8359 // TODO: Should we allow UNDEFS and if so how many?
8360 unsigned Opcode = Op->getOperand(0).getOpcode();
8361 for (unsigned i = 1; i < NumElems; ++i)
8362 if (Opcode != Op->getOperand(i).getOpcode())
8363 return SDValue();
8364
8365 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8366 bool IsShift = false;
8367 switch (Opcode) {
8368 default:
8369 return SDValue();
8370 case ISD::SHL:
8371 case ISD::SRL:
8372 case ISD::SRA:
8373 IsShift = true;
8374 break;
8375 case ISD::AND:
8376 case ISD::XOR:
8377 case ISD::OR:
8378 // Don't do this if the buildvector is a splat - we'd replace one
8379 // constant with an entire vector.
8380 if (Op->getSplatValue())
8381 return SDValue();
8382 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8383 return SDValue();
8384 break;
8385 }
8386
8387 SmallVector<SDValue, 4> LHSElts, RHSElts;
8388 for (SDValue Elt : Op->ops()) {
8389 SDValue LHS = Elt.getOperand(0);
8390 SDValue RHS = Elt.getOperand(1);
8391
8392 // We expect the canonicalized RHS operand to be the constant.
8393 if (!isa<ConstantSDNode>(RHS))
8394 return SDValue();
8395
8396 // Extend shift amounts.
8397 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8398 if (!IsShift)
8399 return SDValue();
8400 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8401 }
8402
8403 LHSElts.push_back(LHS);
8404 RHSElts.push_back(RHS);
8405 }
8406
8407 // Limit to shifts by uniform immediates.
8408 // TODO: Only accept vXi8/vXi64 special cases?
8409 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8410 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8411 return SDValue();
8412
8413 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8414 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8415 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8416
8417 if (!IsShift)
8418 return Res;
8419
8420 // Immediately lower the shift to ensure the constant build vector doesn't
8421 // get converted to a constant pool before the shift is lowered.
8422 return LowerShift(Res, Subtarget, DAG);
8423}
8424
8425/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8426/// functionality to do this, so it's all zeros, all ones, or some derivation
8427/// that is cheap to calculate.
8428static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8429                                         SelectionDAG &DAG,
8430 const X86Subtarget &Subtarget) {
8431 MVT VT = Op.getSimpleValueType();
8432
8433 // Vectors containing all zeros can be matched by pxor and xorps.
8434 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8435 return Op;
8436
8437 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8438 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8439 // vpcmpeqd on 256-bit vectors.
8440 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8441 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8442 return Op;
8443
8444 return getOnesVector(VT, DAG, DL);
8445 }
8446
8447 return SDValue();
8448}
8449
8450/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8451/// from a vector of source values and a vector of extraction indices.
8452/// The vectors might be manipulated to match the type of the permute op.
8453static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8454 const SDLoc &DL, SelectionDAG &DAG,
8455 const X86Subtarget &Subtarget) {
8456 MVT ShuffleVT = VT;
8457 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8458 unsigned NumElts = VT.getVectorNumElements();
8459 unsigned SizeInBits = VT.getSizeInBits();
8460
8461 // Adjust IndicesVec to match VT size.
8462 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8463 "Illegal variable permute mask size");
8464 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8465 // Narrow/widen the indices vector to the correct size.
8466 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8467 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8468 NumElts * VT.getScalarSizeInBits());
8469 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8470 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8471 SDLoc(IndicesVec), SizeInBits);
8472 // Zero-extend the index elements within the vector.
8473 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8474 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8475 IndicesVT, IndicesVec);
8476 }
8477 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8478
8479  // Handle a SrcVec that doesn't match the VT size.
8480 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8481 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8482 // Handle larger SrcVec by treating it as a larger permute.
8483 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8484 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8485 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8486 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8487 Subtarget, DAG, SDLoc(IndicesVec));
8488 SDValue NewSrcVec =
8489 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8490 if (NewSrcVec)
8491 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8492 return SDValue();
8493 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8494 // Widen smaller SrcVec to match VT.
8495 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8496 } else
8497 return SDValue();
8498 }
8499
8500 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8501 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8502 EVT SrcVT = Idx.getValueType();
8503 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8504 uint64_t IndexScale = 0;
8505 uint64_t IndexOffset = 0;
8506
8507 // If we're scaling a smaller permute op, then we need to repeat the
8508 // indices, scaling and offsetting them as well.
8509 // e.g. v4i32 -> v16i8 (Scale = 4)
8510 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8511 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
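    // For illustration: with Scale = 4 as above, a v4i32 index value of 2 is
    // rewritten to the v16i8 byte indices <8, 9, 10, 11>.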
8512 for (uint64_t i = 0; i != Scale; ++i) {
8513 IndexScale |= Scale << (i * NumDstBits);
8514 IndexOffset |= i << (i * NumDstBits);
8515 }
8516
8517 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8518 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8519 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8520 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8521 return Idx;
8522 };
8523
8524 unsigned Opcode = 0;
8525 switch (VT.SimpleTy) {
8526 default:
8527 break;
8528 case MVT::v16i8:
8529 if (Subtarget.hasSSSE3())
8530 Opcode = X86ISD::PSHUFB;
8531 break;
8532 case MVT::v8i16:
8533 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8534 Opcode = X86ISD::VPERMV;
8535 else if (Subtarget.hasSSSE3()) {
8536 Opcode = X86ISD::PSHUFB;
8537 ShuffleVT = MVT::v16i8;
8538 }
8539 break;
8540 case MVT::v4f32:
8541 case MVT::v4i32:
8542 if (Subtarget.hasAVX()) {
8543 Opcode = X86ISD::VPERMILPV;
8544 ShuffleVT = MVT::v4f32;
8545 } else if (Subtarget.hasSSSE3()) {
8546 Opcode = X86ISD::PSHUFB;
8547 ShuffleVT = MVT::v16i8;
8548 }
8549 break;
8550 case MVT::v2f64:
8551 case MVT::v2i64:
8552 if (Subtarget.hasAVX()) {
8553 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8554 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8555 Opcode = X86ISD::VPERMILPV;
8556 ShuffleVT = MVT::v2f64;
8557 } else if (Subtarget.hasSSE41()) {
8558 // SSE41 can compare v2i64 - select between indices 0 and 1.
8559 return DAG.getSelectCC(
8560 DL, IndicesVec,
8561 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8562 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8563 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8564          ISD::SETEQ);
8565    }
8566 break;
8567 case MVT::v32i8:
8568 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8569 Opcode = X86ISD::VPERMV;
8570 else if (Subtarget.hasXOP()) {
8571 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8572 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8573 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8574 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8575 return DAG.getNode(
8576          ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8577          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8578 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8579 } else if (Subtarget.hasAVX()) {
8580 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8581 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8582 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8583 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8584 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8585 ArrayRef<SDValue> Ops) {
8586 // Permute Lo and Hi and then select based on index range.
8587        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8588        // care about bit[7] as it's just an index vector.
8589 SDValue Idx = Ops[2];
8590 EVT VT = Idx.getValueType();
8591 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8592 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8593 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8594                               ISD::SETGT);
8595      };
8596 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8597 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8598 PSHUFBBuilder);
8599 }
8600 break;
8601 case MVT::v16i16:
8602 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8603 Opcode = X86ISD::VPERMV;
8604 else if (Subtarget.hasAVX()) {
8605 // Scale to v32i8 and perform as v32i8.
8606 IndicesVec = ScaleIndices(IndicesVec, 2);
8607 return DAG.getBitcast(
8608          VT, createVariablePermute(
8609                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8610 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8611 }
8612 break;
8613 case MVT::v8f32:
8614 case MVT::v8i32:
8615 if (Subtarget.hasAVX2())
8616 Opcode = X86ISD::VPERMV;
8617 else if (Subtarget.hasAVX()) {
8618 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8619 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8620 {0, 1, 2, 3, 0, 1, 2, 3});
8621 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8622 {4, 5, 6, 7, 4, 5, 6, 7});
8623 if (Subtarget.hasXOP())
8624 return DAG.getBitcast(
8625 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8626 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8627 // Permute Lo and Hi and then select based on index range.
8628 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8629 SDValue Res = DAG.getSelectCC(
8630 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8631 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8632 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8633          ISD::SETGT);
8634      return DAG.getBitcast(VT, Res);
8635 }
8636 break;
8637 case MVT::v4i64:
8638 case MVT::v4f64:
8639 if (Subtarget.hasAVX512()) {
8640 if (!Subtarget.hasVLX()) {
8641 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8642 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8643 SDLoc(SrcVec));
8644 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8645 DAG, SDLoc(IndicesVec));
8646 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8647 DAG, Subtarget);
8648 return extract256BitVector(Res, 0, DAG, DL);
8649 }
8650 Opcode = X86ISD::VPERMV;
8651 } else if (Subtarget.hasAVX()) {
8652 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8653 SDValue LoLo =
8654 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8655 SDValue HiHi =
8656 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8657 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8658 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8659 if (Subtarget.hasXOP())
8660 return DAG.getBitcast(
8661 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8662 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8663 // Permute Lo and Hi and then select based on index range.
8664 // This works as VPERMILPD only uses index bit[1] to permute elements.
8665 SDValue Res = DAG.getSelectCC(
8666 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8667 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8668 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8669          ISD::SETGT);
8670      return DAG.getBitcast(VT, Res);
8671 }
8672 break;
8673 case MVT::v64i8:
8674 if (Subtarget.hasVBMI())
8675 Opcode = X86ISD::VPERMV;
8676 break;
8677 case MVT::v32i16:
8678 if (Subtarget.hasBWI())
8679 Opcode = X86ISD::VPERMV;
8680 break;
8681 case MVT::v16f32:
8682 case MVT::v16i32:
8683 case MVT::v8f64:
8684 case MVT::v8i64:
8685 if (Subtarget.hasAVX512())
8686 Opcode = X86ISD::VPERMV;
8687 break;
8688 }
8689 if (!Opcode)
8690 return SDValue();
8691
8692 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8693 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8694 "Illegal variable permute shuffle type");
8695
8696 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8697 if (Scale > 1)
8698 IndicesVec = ScaleIndices(IndicesVec, Scale);
8699
8700 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8701 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8702
8703 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8704 SDValue Res = Opcode == X86ISD::VPERMV
8705 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8706 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8707 return DAG.getBitcast(VT, Res);
8708}
8709
8710// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8711// reasoned to be a permutation of a vector by indices in a non-constant vector.
8712// (build_vector (extract_elt V, (extract_elt I, 0)),
8713// (extract_elt V, (extract_elt I, 1)),
8714// ...
8715// ->
8716// (vpermv I, V)
8717//
8718// TODO: Handle undefs
8719// TODO: Utilize pshufb and zero mask blending to support more efficient
8720// construction of vectors with constant-0 elements.
8721static SDValue
8722LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8723                                   SelectionDAG &DAG,
8724 const X86Subtarget &Subtarget) {
8725 SDValue SrcVec, IndicesVec;
8726 // Check for a match of the permute source vector and permute index elements.
8727 // This is done by checking that the i-th build_vector operand is of the form:
8728 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8729 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8730 SDValue Op = V.getOperand(Idx);
8731 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8732 return SDValue();
8733
8734 // If this is the first extract encountered in V, set the source vector,
8735 // otherwise verify the extract is from the previously defined source
8736 // vector.
8737 if (!SrcVec)
8738 SrcVec = Op.getOperand(0);
8739 else if (SrcVec != Op.getOperand(0))
8740 return SDValue();
8741 SDValue ExtractedIndex = Op->getOperand(1);
8742 // Peek through extends.
8743 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8744 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8745 ExtractedIndex = ExtractedIndex.getOperand(0);
8746 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8747 return SDValue();
8748
8749 // If this is the first extract from the index vector candidate, set the
8750 // indices vector, otherwise verify the extract is from the previously
8751 // defined indices vector.
8752 if (!IndicesVec)
8753 IndicesVec = ExtractedIndex.getOperand(0);
8754 else if (IndicesVec != ExtractedIndex.getOperand(0))
8755 return SDValue();
8756
8757 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8758 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8759 return SDValue();
8760 }
8761
8762 MVT VT = V.getSimpleValueType();
8763 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8764}
8765
8766SDValue
8767X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8768 SDLoc dl(Op);
8769
8770 MVT VT = Op.getSimpleValueType();
8771 MVT EltVT = VT.getVectorElementType();
8772 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8773 unsigned NumElems = Op.getNumOperands();
8774
8775 // Generate vectors for predicate vectors.
8776 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8777 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8778
8779 if (VT.getVectorElementType() == MVT::bf16 &&
8780 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8781 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8782
8783 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8784 return VectorCst;
8785
8786 unsigned EVTBits = EltVT.getSizeInBits();
8787 APInt UndefMask = APInt::getZero(NumElems);
8788 APInt FrozenUndefMask = APInt::getZero(NumElems);
8789 APInt ZeroMask = APInt::getZero(NumElems);
8790 APInt NonZeroMask = APInt::getZero(NumElems);
8791 bool IsAllConstants = true;
8792 bool OneUseFrozenUndefs = true;
8793 SmallSet<SDValue, 8> Values;
8794 unsigned NumConstants = NumElems;
8795 for (unsigned i = 0; i < NumElems; ++i) {
8796 SDValue Elt = Op.getOperand(i);
8797 if (Elt.isUndef()) {
8798 UndefMask.setBit(i);
8799 continue;
8800 }
8801 if (ISD::isFreezeUndef(Elt.getNode())) {
8802 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8803 FrozenUndefMask.setBit(i);
8804 continue;
8805 }
8806 Values.insert(Elt);
8807 if (!isIntOrFPConstant(Elt)) {
8808 IsAllConstants = false;
8809 NumConstants--;
8810 }
8811 if (X86::isZeroNode(Elt)) {
8812 ZeroMask.setBit(i);
8813 } else {
8814 NonZeroMask.setBit(i);
8815 }
8816 }
8817
8818 // All undef vector. Return an UNDEF.
8819 if (UndefMask.isAllOnes())
8820 return DAG.getUNDEF(VT);
8821
8822 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8823 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8824 return DAG.getFreeze(DAG.getUNDEF(VT));
8825
8826 // All undef/freeze(undef)/zero vector. Return a zero vector.
8827 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8828 return getZeroVector(VT, Subtarget, DAG, dl);
8829
8830 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8831 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8832 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8833 // and blend the FREEZE-UNDEF operands back in.
8834 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
8835 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8836 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8837 SmallVector<int, 16> BlendMask(NumElems, -1);
8838 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8839 for (unsigned i = 0; i < NumElems; ++i) {
8840 if (UndefMask[i]) {
8841 BlendMask[i] = -1;
8842 continue;
8843 }
8844 BlendMask[i] = i;
8845 if (!FrozenUndefMask[i])
8846 Elts[i] = Op.getOperand(i);
8847 else
8848 BlendMask[i] += NumElems;
8849 }
8850 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8851 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8852 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8853 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8854 }
8855
8856 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8857
8858 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8859 // be better off lowering to a smaller build vector and padding with
8860 // undef/zero.
8861 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8863 unsigned UpperElems = NumElems / 2;
8864 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8865 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8866 if (NumUpperUndefsOrZeros >= UpperElems) {
8867 if (VT.is512BitVector() &&
8868 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8869 UpperElems = NumElems - (NumElems / 4);
8870 // If freeze(undef) is in any upper elements, force to zero.
8871 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8872 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8873 SDValue NewBV =
8874 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8875 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8876 }
8877 }
8878
8879 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8880 return AddSub;
8881 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8882 return HorizontalOp;
8883 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8884 return Broadcast;
8885 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8886 return BitOp;
8887
8888 unsigned NumZero = ZeroMask.popcount();
8889 unsigned NumNonZero = NonZeroMask.popcount();
8890
8891 // If we are inserting one variable into a vector of non-zero constants, try
8892 // to avoid loading each constant element as a scalar. Load the constants as a
8893 // vector and then insert the variable scalar element. If insertion is not
8894 // supported, fall back to a shuffle to get the scalar blended with the
8895 // constants. Insertion into a zero vector is handled as a special-case
8896 // somewhere below here.
8897 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8898 FrozenUndefMask.isZero() &&
8899 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8900 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8901 // Create an all-constant vector. The variable element in the old
8902 // build vector is replaced by undef in the constant vector. Save the
8903 // variable scalar element and its index for use in the insertelement.
8904 LLVMContext &Context = *DAG.getContext();
8905 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8906 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8907 SDValue VarElt;
8908 SDValue InsIndex;
8909 for (unsigned i = 0; i != NumElems; ++i) {
8910 SDValue Elt = Op.getOperand(i);
8911 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8912 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8913 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8914 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8915 else if (!Elt.isUndef()) {
8916 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8917 "Expected one variable element in this vector");
8918 VarElt = Elt;
8919 InsIndex = DAG.getVectorIdxConstant(i, dl);
8920 }
8921 }
8922 Constant *CV = ConstantVector::get(ConstVecOps);
8923 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8924
8925 // The constants we just created may not be legal (eg, floating point). We
8926 // must lower the vector right here because we can not guarantee that we'll
8927 // legalize it before loading it. This is also why we could not just create
8928 // a new build vector here. If the build vector contains illegal constants,
8929 // it could get split back up into a series of insert elements.
8930 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8931 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8932 MachinePointerInfo MPI =
8933 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8934 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8935 unsigned InsertC = InsIndex->getAsZExtVal();
8936 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8937 if (InsertC < NumEltsInLow128Bits)
8938 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8939
8940 // There's no good way to insert into the high elements of a >128-bit
8941 // vector, so use shuffles to avoid an extract/insert sequence.
8942 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8943 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8944 SmallVector<int, 8> ShuffleMask;
8945 unsigned NumElts = VT.getVectorNumElements();
8946 for (unsigned i = 0; i != NumElts; ++i)
8947 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8948 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8949 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8950 }
8951
8952 // Special case for single non-zero, non-undef, element.
8953 if (NumNonZero == 1) {
8954 unsigned Idx = NonZeroMask.countr_zero();
8955 SDValue Item = Op.getOperand(Idx);
8956
8957 // If we have a constant or non-constant insertion into the low element of
8958 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8959 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8960 // depending on what the source datatype is.
8961 if (Idx == 0) {
8962 if (NumZero == 0)
8963 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8964
8965 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8966 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8967 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8968 assert((VT.is128BitVector() || VT.is256BitVector() ||
8969 VT.is512BitVector()) &&
8970 "Expected an SSE value type!");
8971 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8972 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8973 // zero vector.
8974 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8975 }
8976
8977 // We can't directly insert an i8 or i16 into a vector, so zero extend
8978 // it to i32 first.
8979 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8980 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8981 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8982 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8983 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8984 return DAG.getBitcast(VT, Item);
8985 }
8986 }
8987
8988 // Is it a vector logical left shift?
8989 if (NumElems == 2 && Idx == 1 &&
8990 X86::isZeroNode(Op.getOperand(0)) &&
8991 !X86::isZeroNode(Op.getOperand(1))) {
8992 unsigned NumBits = VT.getSizeInBits();
8993 return getVShift(true, VT,
8994 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8995 VT, Op.getOperand(1)),
8996 NumBits/2, DAG, *this, dl);
8997 }
8998
8999 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9000 return SDValue();
9001
9002 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9003 // is a non-constant being inserted into an element other than the low one,
9004 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9005 // movd/movss) to move this into the low element, then shuffle it into
9006 // place.
9007 if (EVTBits == 32) {
9008 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9009 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9010 }
9011 }
9012
9013 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9014 if (Values.size() == 1) {
9015 if (EVTBits == 32) {
9016 // Instead of a shuffle like this:
9017 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9018 // Check if it's possible to issue this instead.
9019 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9020 unsigned Idx = NonZeroMask.countr_zero();
9021 SDValue Item = Op.getOperand(Idx);
9022 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9023 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9024 }
9025 return SDValue();
9026 }
9027
9028 // A vector full of immediates; various special cases are already
9029 // handled, so this is best done with a single constant-pool load.
9030 if (IsAllConstants)
9031 return SDValue();
9032
9033 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9034 return V;
9035
9036 // See if we can use a vector load to get all of the elements.
9037 {
9038 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9039 if (SDValue LD =
9040 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9041 return LD;
9042 }
9043
9044 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9045 // build_vector and broadcast it.
9046 // TODO: We could probably generalize this more.
9047 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9048 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9049 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9050 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9051 // Make sure all the even/odd operands match.
9052 for (unsigned i = 2; i != NumElems; ++i)
9053 if (Ops[i % 2] != Op.getOperand(i))
9054 return false;
9055 return true;
9056 };
9057 if (CanSplat(Op, NumElems, Ops)) {
9058 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9059 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9060 // Create a new build vector and cast to v2i64/v2f64.
9061 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9062 DAG.getBuildVector(NarrowVT, dl, Ops));
9063 // Broadcast from v2i64/v2f64 and cast to final VT.
9064 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9065 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9066 NewBV));
9067 }
9068 }
9069
9070 // For AVX-length vectors, build the individual 128-bit pieces and use
9071 // shuffles to put them in place.
9072 if (VT.getSizeInBits() > 128) {
9073 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9074
9075 // Build both the lower and upper subvector.
9076 SDValue Lower =
9077 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9078 SDValue Upper = DAG.getBuildVector(
9079 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9080
9081 // Recreate the wider vector with the lower and upper part.
9082 return concatSubVectors(Lower, Upper, DAG, dl);
9083 }
9084
9085 // Let legalizer expand 2-wide build_vectors.
9086 if (EVTBits == 64) {
9087 if (NumNonZero == 1) {
9088 // One half is zero or undef.
9089 unsigned Idx = NonZeroMask.countr_zero();
9090 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9091 Op.getOperand(Idx));
9092 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9093 }
9094 return SDValue();
9095 }
9096
9097 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9098 if (EVTBits == 8 && NumElems == 16)
9099 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9100 NumZero, DAG, Subtarget))
9101 return V;
9102
9103 if (EltVT == MVT::i16 && NumElems == 8)
9104 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9105 NumZero, DAG, Subtarget))
9106 return V;
9107
9108 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9109 if (EVTBits == 32 && NumElems == 4)
9110 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9111 return V;
9112
9113 // If element VT is == 32 bits, turn it into a number of shuffles.
9114 if (NumElems == 4 && NumZero > 0) {
9115 SmallVector<SDValue, 8> Ops(NumElems);
9116 for (unsigned i = 0; i < 4; ++i) {
9117 bool isZero = !NonZeroMask[i];
9118 if (isZero)
9119 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9120 else
9121 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9122 }
9123
9124 for (unsigned i = 0; i < 2; ++i) {
9125 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9126 default: llvm_unreachable("Unexpected NonZero count");
9127 case 0:
9128 Ops[i] = Ops[i*2]; // Must be a zero vector.
9129 break;
9130 case 1:
9131 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9132 break;
9133 case 2:
9134 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9135 break;
9136 case 3:
9137 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9138 break;
9139 }
9140 }
9141
9142 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9143 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9144 int MaskVec[] = {
9145 Reverse1 ? 1 : 0,
9146 Reverse1 ? 0 : 1,
9147 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9148 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9149 };
9150 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9151 }
9152
9153 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9154
9155 // Check for a build vector from mostly shuffle plus few inserting.
9156 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9157 return Sh;
9158
9159 // For SSE 4.1, use insertps to put the high elements into the low element.
9160 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9161 SDValue Result;
9162 if (!Op.getOperand(0).isUndef())
9163 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9164 else
9165 Result = DAG.getUNDEF(VT);
9166
9167 for (unsigned i = 1; i < NumElems; ++i) {
9168 if (Op.getOperand(i).isUndef()) continue;
9169 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9170 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9171 }
9172 return Result;
9173 }
9174
9175 // Otherwise, expand into a number of unpckl*, start by extending each of
9176 // our (non-undef) elements to the full vector width with the element in the
9177 // bottom slot of the vector (which generates no code for SSE).
9178 SmallVector<SDValue, 8> Ops(NumElems);
9179 for (unsigned i = 0; i < NumElems; ++i) {
9180 if (!Op.getOperand(i).isUndef())
9181 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9182 else
9183 Ops[i] = DAG.getUNDEF(VT);
9184 }
9185
9186 // Next, we iteratively mix elements, e.g. for v4f32:
9187 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9188 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9189 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9190 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9191 // Generate scaled UNPCKL shuffle mask.
9192 SmallVector<int, 16> Mask;
9193 for(unsigned i = 0; i != Scale; ++i)
9194 Mask.push_back(i);
9195 for (unsigned i = 0; i != Scale; ++i)
9196 Mask.push_back(NumElems+i);
9197 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9198
9199 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9200 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9201 }
9202 return Ops[0];
9203}
9204
9205// 256-bit AVX can use the vinsertf128 instruction
9206// to create 256-bit vectors from two other 128-bit ones.
9207// TODO: Detect subvector broadcast here instead of DAG combine?
9208static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9209 const X86Subtarget &Subtarget) {
9210 SDLoc dl(Op);
9211 MVT ResVT = Op.getSimpleValueType();
9212
9213 assert((ResVT.is256BitVector() ||
9214 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9215
9216 unsigned NumOperands = Op.getNumOperands();
9217 unsigned NumFreezeUndef = 0;
9218 unsigned NumZero = 0;
9219 unsigned NumNonZero = 0;
9220 unsigned NonZeros = 0;
9221 for (unsigned i = 0; i != NumOperands; ++i) {
9222 SDValue SubVec = Op.getOperand(i);
9223 if (SubVec.isUndef())
9224 continue;
9225 if (ISD::isFreezeUndef(SubVec.getNode())) {
9226 // If the freeze(undef) has multiple uses then we must fold to zero.
9227 if (SubVec.hasOneUse())
9228 ++NumFreezeUndef;
9229 else
9230 ++NumZero;
9231 }
9232 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9233 ++NumZero;
9234 else {
9235 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9236 NonZeros |= 1 << i;
9237 ++NumNonZero;
9238 }
9239 }
9240
9241 // If we have more than 2 non-zeros, build each half separately.
9242 if (NumNonZero > 2) {
9243 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9244 ArrayRef<SDUse> Ops = Op->ops();
9245 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9246 Ops.slice(0, NumOperands/2));
9247 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9248 Ops.slice(NumOperands/2));
9249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9250 }
9251
9252 // Otherwise, build it up through insert_subvectors.
9253 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9254 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9255 : DAG.getUNDEF(ResVT));
9256
9257 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9258 unsigned NumSubElems = SubVT.getVectorNumElements();
9259 for (unsigned i = 0; i != NumOperands; ++i) {
9260 if ((NonZeros & (1 << i)) == 0)
9261 continue;
9262
9263 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9264 Op.getOperand(i),
9265 DAG.getIntPtrConstant(i * NumSubElems, dl));
9266 }
9267
9268 return Vec;
9269}
9270
9271// Returns true if the given node is a type promotion (by concatenating i1
9272// zeros) of the result of a node that already zeros all upper bits of
9273// k-register.
9274// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9275static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9276 const X86Subtarget &Subtarget,
9277 SelectionDAG & DAG) {
9278 SDLoc dl(Op);
9279 MVT ResVT = Op.getSimpleValueType();
9280 unsigned NumOperands = Op.getNumOperands();
9281
9282 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9283 "Unexpected number of operands in CONCAT_VECTORS");
9284
9285 uint64_t Zeros = 0;
9286 uint64_t NonZeros = 0;
9287 for (unsigned i = 0; i != NumOperands; ++i) {
9288 SDValue SubVec = Op.getOperand(i);
9289 if (SubVec.isUndef())
9290 continue;
9291 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9292 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9293 Zeros |= (uint64_t)1 << i;
9294 else
9295 NonZeros |= (uint64_t)1 << i;
9296 }
9297
9298 unsigned NumElems = ResVT.getVectorNumElements();
9299
9300 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9301 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9302 // insert_subvector will give us two kshifts.
9303 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9304 Log2_64(NonZeros) != NumOperands - 1) {
9305 unsigned Idx = Log2_64(NonZeros);
9306 SDValue SubVec = Op.getOperand(Idx);
9307 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9308 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9309 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9310 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9311 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9312 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9313 DAG.getIntPtrConstant(0, dl));
9314 }
9315
9316 // If there are zero or one non-zeros we can handle this very simply.
9317 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9318 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9319 if (!NonZeros)
9320 return Vec;
9321 unsigned Idx = Log2_64(NonZeros);
9322 SDValue SubVec = Op.getOperand(Idx);
9323 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9324 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9325 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9326 }
9327
9328 if (NumOperands > 2) {
9329 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9330 ArrayRef<SDUse> Ops = Op->ops();
9331 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9332 Ops.slice(0, NumOperands/2));
9333 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9334 Ops.slice(NumOperands/2));
9335 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9336 }
9337
9338 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9339
9340 if (ResVT.getVectorNumElements() >= 16)
9341 return Op; // The operation is legal with KUNPCK
9342
9343 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9344 DAG.getUNDEF(ResVT), Op.getOperand(0),
9345 DAG.getIntPtrConstant(0, dl));
9346 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9347 DAG.getIntPtrConstant(NumElems/2, dl));
9348}
9349
9350static SDValue LowerCONCAT_VECTORS(SDValue Op,
9351 const X86Subtarget &Subtarget,
9352 SelectionDAG &DAG) {
9353 MVT VT = Op.getSimpleValueType();
9354 if (VT.getVectorElementType() == MVT::i1)
9355 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9356
9357 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9358 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9359 Op.getNumOperands() == 4)));
9360
9361 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9362 // from two other 128-bit ones.
9363
9364 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9365 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9366}
9367
9368//===----------------------------------------------------------------------===//
9369// Vector shuffle lowering
9370//
9371// This is an experimental code path for lowering vector shuffles on x86. It is
9372// designed to handle arbitrary vector shuffles and blends, gracefully
9373// degrading performance as necessary. It works hard to recognize idiomatic
9374// shuffles and lower them to optimal instruction patterns without leaving
9375// a framework that allows reasonably efficient handling of all vector shuffle
9376// patterns.
9377//===----------------------------------------------------------------------===//
9378
9379/// Tiny helper function to identify a no-op mask.
9380///
9381/// This is a somewhat boring predicate function. It checks whether the mask
9382/// array input, which is assumed to be a single-input shuffle mask of the kind
9383/// used by the X86 shuffle instructions (not a fully general
9384/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9385/// in-place shuffle are 'no-op's.
9386static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9387 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9388 assert(Mask[i] >= -1 && "Out of bound mask element!");
9389 if (Mask[i] >= 0 && Mask[i] != i)
9390 return false;
9391 }
9392 return true;
9393}
9394
9395/// Test whether there are elements crossing LaneSizeInBits lanes in this
9396/// shuffle mask.
9397///
9398/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9399/// and we routinely test for these.
9400static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9401 unsigned ScalarSizeInBits,
9402 ArrayRef<int> Mask) {
9403 assert(LaneSizeInBits && ScalarSizeInBits &&
9404 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9405 "Illegal shuffle lane size");
9406 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9407 int Size = Mask.size();
9408 for (int i = 0; i < Size; ++i)
9409 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9410 return true;
9411 return false;
9412}
9413
9414/// Test whether there are elements crossing 128-bit lanes in this
9415/// shuffle mask.
9416static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9417 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9418}
9419
9420/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9421/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9422/// better support 'repeated mask + lane permute' style shuffles.
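/// For example, for v8f32 with 128-bit lanes, the mask <4,5,6,7,0,1,2,3> is
/// lane-crossing but not multi-lane (each destination lane reads from a single
/// source lane), whereas <0,4,1,5,2,6,3,7> is multi-lane because its first
/// destination lane mixes elements from source lanes 0 and 1.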
9423static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9424 unsigned ScalarSizeInBits,
9425 ArrayRef<int> Mask) {
9426 assert(LaneSizeInBits && ScalarSizeInBits &&
9427 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9428 "Illegal shuffle lane size");
9429 int NumElts = Mask.size();
9430 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9431 int NumLanes = NumElts / NumEltsPerLane;
9432 if (NumLanes > 1) {
9433 for (int i = 0; i != NumLanes; ++i) {
9434 int SrcLane = -1;
9435 for (int j = 0; j != NumEltsPerLane; ++j) {
9436 int M = Mask[(i * NumEltsPerLane) + j];
9437 if (M < 0)
9438 continue;
9439 int Lane = (M % NumElts) / NumEltsPerLane;
9440 if (SrcLane >= 0 && SrcLane != Lane)
9441 return true;
9442 SrcLane = Lane;
9443 }
9444 }
9445 }
9446 return false;
9447}
9448
9449/// Test whether a shuffle mask is equivalent within each sub-lane.
9450///
9451/// This checks a shuffle mask to see if it is performing the same
9452/// lane-relative shuffle in each sub-lane. This trivially implies
9453/// that it is also not lane-crossing. It may however involve a blend from the
9454/// same lane of a second vector.
9455///
9456/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9457/// non-trivial to compute in the face of undef lanes. The representation is
9458/// suitable for use with existing 128-bit shuffles as entries from the second
9459/// vector have been remapped to [LaneSize, 2*LaneSize).
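/// For example, the v8f32 mask <0,9,2,3,4,13,6,7> repeats the same 128-bit
/// lane pattern (a blend from the matching lane of the second vector) and
/// populates RepeatedMask as <0,5,2,3>, with the second-vector element
/// remapped into the [4, 8) range.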
9460static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9461 ArrayRef<int> Mask,
9462 SmallVectorImpl<int> &RepeatedMask) {
9463 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9464 RepeatedMask.assign(LaneSize, -1);
9465 int Size = Mask.size();
9466 for (int i = 0; i < Size; ++i) {
9467 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9468 if (Mask[i] < 0)
9469 continue;
9470 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9471 // This entry crosses lanes, so there is no way to model this shuffle.
9472 return false;
9473
9474 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9475 // Adjust second vector indices to start at LaneSize instead of Size.
9476 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9477 : Mask[i] % LaneSize + LaneSize;
9478 if (RepeatedMask[i % LaneSize] < 0)
9479 // This is the first non-undef entry in this slot of a 128-bit lane.
9480 RepeatedMask[i % LaneSize] = LocalM;
9481 else if (RepeatedMask[i % LaneSize] != LocalM)
9482 // Found a mismatch with the repeated mask.
9483 return false;
9484 }
9485 return true;
9486}
9487
9488/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9489static bool
9490is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9491 SmallVectorImpl<int> &RepeatedMask) {
9492 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9493}
9494
9495static bool
9496is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9497 SmallVector<int, 32> RepeatedMask;
9498 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9499}
9500
9501/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9502static bool
9503is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9504 SmallVectorImpl<int> &RepeatedMask) {
9505 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9506}
9507
9508/// Test whether a target shuffle mask is equivalent within each sub-lane.
9509/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9510static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9511 unsigned EltSizeInBits,
9512 ArrayRef<int> Mask,
9513 SmallVectorImpl<int> &RepeatedMask) {
9514 int LaneSize = LaneSizeInBits / EltSizeInBits;
9515 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9516 int Size = Mask.size();
9517 for (int i = 0; i < Size; ++i) {
9518 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9519 if (Mask[i] == SM_SentinelUndef)
9520 continue;
9521 if (Mask[i] == SM_SentinelZero) {
9522 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9523 return false;
9524 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9525 continue;
9526 }
9527 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9528 // This entry crosses lanes, so there is no way to model this shuffle.
9529 return false;
9530
9531 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9532 // later vector indices to start at multiples of LaneSize instead of Size.
9533 int LaneM = Mask[i] / Size;
9534 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9535 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9536 // This is the first non-undef entry in this slot of a 128-bit lane.
9537 RepeatedMask[i % LaneSize] = LocalM;
9538 else if (RepeatedMask[i % LaneSize] != LocalM)
9539 // Found a mismatch with the repeated mask.
9540 return false;
9541 }
9542 return true;
9543}
9544
9545/// Test whether a target shuffle mask is equivalent within each sub-lane.
9546/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9547static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9548 ArrayRef<int> Mask,
9549 SmallVectorImpl<int> &RepeatedMask) {
9550 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9551 Mask, RepeatedMask);
9552}
9553
9554/// Checks whether the vector elements referenced by two shuffle masks are
9555/// equivalent.
9556static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9557 int Idx, int ExpectedIdx) {
9558 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9559 ExpectedIdx < MaskSize && "Out of range element index");
9560 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9561 return false;
9562
9563 switch (Op.getOpcode()) {
9564 case ISD::BUILD_VECTOR:
9565 // If the values are build vectors, we can look through them to find
9566 // equivalent inputs that make the shuffles equivalent.
9567 // TODO: Handle MaskSize != Op.getNumOperands()?
9568 if (MaskSize == (int)Op.getNumOperands() &&
9569 MaskSize == (int)ExpectedOp.getNumOperands())
9570 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9571 break;
9572 case X86ISD::VBROADCAST:
9573 case X86ISD::VBROADCAST_LOAD:
9574 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9575 return (Op == ExpectedOp &&
9576 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9577 case X86ISD::HADD:
9578 case X86ISD::HSUB:
9579 case X86ISD::FHADD:
9580 case X86ISD::FHSUB:
9581 case X86ISD::PACKSS:
9582 case X86ISD::PACKUS:
9583 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9584 // TODO: Handle MaskSize != NumElts?
9585 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9586 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9587 MVT VT = Op.getSimpleValueType();
9588 int NumElts = VT.getVectorNumElements();
9589 if (MaskSize == NumElts) {
9590 int NumLanes = VT.getSizeInBits() / 128;
9591 int NumEltsPerLane = NumElts / NumLanes;
9592 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9593 bool SameLane =
9594 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9595 bool SameElt =
9596 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9597 return SameLane && SameElt;
9598 }
9599 }
9600 break;
9601 }
9602
9603 return false;
9604}
9605
9606/// Checks whether a shuffle mask is equivalent to an explicit list of
9607/// arguments.
9608///
9609/// This is a fast way to test a shuffle mask against a fixed pattern:
9610///
9611/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9612///
9613/// It returns true if the mask is exactly as wide as the argument list, and
9614/// each element of the mask is either -1 (signifying undef) or the value given
9615/// in the argument.
9616static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9617 SDValue V1 = SDValue(),
9618 SDValue V2 = SDValue()) {
9619 int Size = Mask.size();
9620 if (Size != (int)ExpectedMask.size())
9621 return false;
9622
9623 for (int i = 0; i < Size; ++i) {
9624 assert(Mask[i] >= -1 && "Out of bound mask element!");
9625 int MaskIdx = Mask[i];
9626 int ExpectedIdx = ExpectedMask[i];
9627 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9628 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9629 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9630 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9631 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9632 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9633 return false;
9634 }
9635 }
9636 return true;
9637}
9638
9639/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9640///
9641/// The masks must be exactly the same width.
9642///
9643/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9644/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9645///
9646/// SM_SentinelZero is accepted as a valid negative index but must match in
9647/// both, or via a known bits test.
9648static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9649 ArrayRef<int> ExpectedMask,
9650 const SelectionDAG &DAG,
9651 SDValue V1 = SDValue(),
9652 SDValue V2 = SDValue()) {
9653 int Size = Mask.size();
9654 if (Size != (int)ExpectedMask.size())
9655 return false;
9656 assert(llvm::all_of(ExpectedMask,
9657 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9658 "Illegal target shuffle mask");
9659
9660 // Check for out-of-range target shuffle mask indices.
9661 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9662 return false;
9663
9664 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9665 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9666 !V1.getValueType().isVector()))
9667 V1 = SDValue();
9668 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9669 !V2.getValueType().isVector()))
9670 V2 = SDValue();
9671
9672 APInt ZeroV1 = APInt::getZero(Size);
9673 APInt ZeroV2 = APInt::getZero(Size);
9674
9675 for (int i = 0; i < Size; ++i) {
9676 int MaskIdx = Mask[i];
9677 int ExpectedIdx = ExpectedMask[i];
9678 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9679 continue;
9680 if (MaskIdx == SM_SentinelZero) {
9681 // If we need this expected index to be a zero element, then update the
9682 // relevant zero mask and perform the known bits at the end to minimize
9683 // repeated computes.
9684 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9685 if (ExpectedV &&
9686 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9687 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9688 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9689 ZeroMask.setBit(BitIdx);
9690 continue;
9691 }
9692 }
9693 if (MaskIdx >= 0) {
9694 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9695 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9696 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9697 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9698 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9699 continue;
9700 }
9701 return false;
9702 }
9703 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9704 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9705}
9706
9707// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9708// instructions.
9709static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9710 const SelectionDAG &DAG) {
9711 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9712 return false;
9713
9714 SmallVector<int, 8> Unpcklwd;
9715 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9716 /* Unary = */ false);
9717 SmallVector<int, 8> Unpckhwd;
9718 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9719 /* Unary = */ false);
9720 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9721 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9722 return IsUnpackwdMask;
9723}
9724
9725static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9726 const SelectionDAG &DAG) {
9727 // Create 128-bit vector type based on mask size.
9728 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9729 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9730
9731 // We can't assume a canonical shuffle mask, so try the commuted version too.
9732 SmallVector<int, 4> CommutedMask(Mask);
9733 ShuffleVectorSDNode::commuteMask(CommutedMask);
9734
9735 // Match any of unary/binary or low/high.
9736 for (unsigned i = 0; i != 4; ++i) {
9737 SmallVector<int, 16> UnpackMask;
9738 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9739 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9740 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9741 return true;
9742 }
9743 return false;
9744}
9745
9746/// Return true if a shuffle mask chooses elements identically in its top and
9747/// bottom halves. For example, any splat mask has the same top and bottom
9748/// halves. If an element is undefined in only one half of the mask, the halves
9749/// are not considered identical.
9750static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9751 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9752 unsigned HalfSize = Mask.size() / 2;
9753 for (unsigned i = 0; i != HalfSize; ++i) {
9754 if (Mask[i] != Mask[i + HalfSize])
9755 return false;
9756 }
9757 return true;
9758}
9759
9760/// Get a 4-lane 8-bit shuffle immediate for a mask.
9761///
9762/// This helper function produces an 8-bit shuffle immediate corresponding to
9763/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9764/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9765/// example.
9766///
9767/// NB: We rely heavily on "undef" masks preserving the input lane.
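/// For example, the mask <1,2,3,0> encodes as 0b00111001 (0x39), while a mask
/// with a single non-undef element such as <-1,2,-1,-1> is splatted to
/// <2,2,2,2> and encodes as 0xAA.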
9768static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9769 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9770 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9771 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9772 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9773 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9774
9775 // If the mask only uses one non-undef element, then fully 'splat' it to
9776 // improve later broadcast matching.
9777 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9778 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9779
9780 int FirstElt = Mask[FirstIndex];
9781 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9782 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9783
9784 unsigned Imm = 0;
9785 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9786 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9787 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9788 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9789 return Imm;
9790}
9791
9792static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9793 SelectionDAG &DAG) {
9794 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9795}
9796
9797// The shuffle result takes the form:
9798// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
9799// Each element of Zeroable corresponds to a particular element of Mask,
9800// as described in the computeZeroableShuffleElements function.
9801//
9802// The function looks for a sub-mask whose non-zero elements are in
9803// increasing order. If such a sub-mask exists, the function returns true.
9804static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9805 ArrayRef<int> Mask, const EVT &VectorType,
9806 bool &IsZeroSideLeft) {
9807 int NextElement = -1;
9808 // Check if the Mask's nonzero elements are in increasing order.
9809 for (int i = 0, e = Mask.size(); i < e; i++) {
9810 // Checks if the mask's zeros elements are built from only zeros.
9811 assert(Mask[i] >= -1 && "Out of bound mask element!");
9812 if (Mask[i] < 0)
9813 return false;
9814 if (Zeroable[i])
9815 continue;
9816 // Find the lowest non zero element
9817 if (NextElement < 0) {
9818 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9819 IsZeroSideLeft = NextElement != 0;
9820 }
9821 // Exit if the mask's non zero elements are not in increasing order.
9822 if (NextElement != Mask[i])
9823 return false;
9824 NextElement++;
9825 }
9826 return true;
9827}
9828
9829/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9830static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9831 ArrayRef<int> Mask, SDValue V1,
9832 SDValue V2, const APInt &Zeroable,
9833 const X86Subtarget &Subtarget,
9834 SelectionDAG &DAG) {
9835 int Size = Mask.size();
9836 int LaneSize = 128 / VT.getScalarSizeInBits();
9837 const int NumBytes = VT.getSizeInBits() / 8;
9838 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9839
9840 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9841 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9842 (Subtarget.hasBWI() && VT.is512BitVector()));
9843
9844 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9845 // Sign bit set in i8 mask means zero element.
9846 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9847
9848 SDValue V;
9849 for (int i = 0; i < NumBytes; ++i) {
9850 int M = Mask[i / NumEltBytes];
9851 if (M < 0) {
9852 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9853 continue;
9854 }
9855 if (Zeroable[i / NumEltBytes]) {
9856 PSHUFBMask[i] = ZeroMask;
9857 continue;
9858 }
9859
9860 // We can only use a single input of V1 or V2.
9861 SDValue SrcV = (M >= Size ? V2 : V1);
9862 if (V && V != SrcV)
9863 return SDValue();
9864 V = SrcV;
9865 M %= Size;
9866
9867 // PSHUFB can't cross lanes, ensure this doesn't happen.
9868 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9869 return SDValue();
9870
9871 M = M % LaneSize;
9872 M = M * NumEltBytes + (i % NumEltBytes);
9873 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9874 }
9875 assert(V && "Failed to find a source input");
9876
9877 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9878 return DAG.getBitcast(
9879 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9880 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9881}
9882
9883static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9884 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9885 const SDLoc &dl);
9886
9887// X86 has dedicated shuffle that can be lowered to VEXPAND
9888static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9889 const APInt &Zeroable,
9890 ArrayRef<int> Mask, SDValue &V1,
9891 SDValue &V2, SelectionDAG &DAG,
9892 const X86Subtarget &Subtarget) {
9893 bool IsLeftZeroSide = true;
9894 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9895 IsLeftZeroSide))
9896 return SDValue();
9897 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9898 MVT IntegerType =
9899 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9900 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9901 unsigned NumElts = VT.getVectorNumElements();
9902 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9903 "Unexpected number of vector elements");
9904 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9905 Subtarget, DAG, DL);
9906 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9907 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9908 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9909}
9910
9911static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9912 unsigned &UnpackOpcode, bool IsUnary,
9913 ArrayRef<int> TargetMask, const SDLoc &DL,
9914 SelectionDAG &DAG,
9915 const X86Subtarget &Subtarget) {
9916 int NumElts = VT.getVectorNumElements();
9917
9918 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9919 for (int i = 0; i != NumElts; i += 2) {
9920 int M1 = TargetMask[i + 0];
9921 int M2 = TargetMask[i + 1];
9922 Undef1 &= (SM_SentinelUndef == M1);
9923 Undef2 &= (SM_SentinelUndef == M2);
9924 Zero1 &= isUndefOrZero(M1);
9925 Zero2 &= isUndefOrZero(M2);
9926 }
9927 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9928 "Zeroable shuffle detected");
9929
9930 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9931 SmallVector<int, 64> Unpckl, Unpckh;
9932 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9933 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9934 (IsUnary ? V1 : V2))) {
9935 UnpackOpcode = X86ISD::UNPCKL;
9936 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9937 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9938 return true;
9939 }
9940
9941 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9942 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9943 (IsUnary ? V1 : V2))) {
9944 UnpackOpcode = X86ISD::UNPCKH;
9945 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9946 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9947 return true;
9948 }
9949
9950 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9951 if (IsUnary && (Zero1 || Zero2)) {
9952 // Don't bother if we can blend instead.
9953 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9954 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9955 return false;
9956
9957 bool MatchLo = true, MatchHi = true;
9958 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9959 int M = TargetMask[i];
9960
9961 // Ignore if the input is known to be zero or the index is undef.
9962 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9963 (M == SM_SentinelUndef))
9964 continue;
9965
9966 MatchLo &= (M == Unpckl[i]);
9967 MatchHi &= (M == Unpckh[i]);
9968 }
9969
9970 if (MatchLo || MatchHi) {
9971 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9972 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9973 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9974 return true;
9975 }
9976 }
9977
9978 // If a binary shuffle, commute and try again.
9979 if (!IsUnary) {
9980 ShuffleVectorSDNode::commuteMask(Unpckl);
9981 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9982 UnpackOpcode = X86ISD::UNPCKL;
9983 std::swap(V1, V2);
9984 return true;
9985 }
9986
9987 ShuffleVectorSDNode::commuteMask(Unpckh);
9988 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9989 UnpackOpcode = X86ISD::UNPCKH;
9990 std::swap(V1, V2);
9991 return true;
9992 }
9993 }
9994
9995 return false;
9996}
9997
9998// X86 has dedicated unpack instructions that can handle specific blend
9999// operations: UNPCKH and UNPCKL.
10000static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10001 ArrayRef<int> Mask, SDValue V1, SDValue V2,
10002 SelectionDAG &DAG) {
10003 SmallVector<int, 8> Unpckl;
10004 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10005 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10006 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10007
10008 SmallVector<int, 8> Unpckh;
10009 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10010 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10011 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10012
10013 // Commute and try again.
10014 ShuffleVectorSDNode::commuteMask(Unpckl);
10015 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10016 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10017
10018 ShuffleVectorSDNode::commuteMask(Unpckh);
10019 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10020 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10021
10022 return SDValue();
10023}
10024
10025/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10026/// followed by unpack 256-bit.
10027static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10028 ArrayRef<int> Mask, SDValue V1,
10029 SDValue V2, SelectionDAG &DAG) {
10030 SmallVector<int, 32> Unpckl, Unpckh;
10031 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10032 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10033
10034 unsigned UnpackOpcode;
10035 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10036 UnpackOpcode = X86ISD::UNPCKL;
10037 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10038 UnpackOpcode = X86ISD::UNPCKH;
10039 else
10040 return SDValue();
10041
10042 // This is a "natural" unpack operation (rather than the 128-bit sectored
10043 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10044 // input in order to use the x86 instruction.
10045 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10046 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10047 V1 = DAG.getBitcast(VT, V1);
10048 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10049}
10050
10051// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10052// source into the lower elements and zeroing the upper elements.
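// For example, on a target with AVX512VL and AVX512BW, a v16i8 shuffle mask
// <0,2,4,6,8,10,12,14> with the upper eight elements zeroable matches as a
// truncation from a v8i16 source; since the eight truncated bytes are narrower
// than 128 bits, DstVT takes the X86ISD::VTRUNC form (v16i8).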
10053static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10054 ArrayRef<int> Mask, const APInt &Zeroable,
10055 const X86Subtarget &Subtarget) {
10056 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10057 return false;
10058
10059 unsigned NumElts = Mask.size();
10060 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10061 unsigned MaxScale = 64 / EltSizeInBits;
10062
10063 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10064 unsigned SrcEltBits = EltSizeInBits * Scale;
10065 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10066 continue;
10067 unsigned NumSrcElts = NumElts / Scale;
10068 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10069 continue;
10070 unsigned UpperElts = NumElts - NumSrcElts;
10071 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10072 continue;
10073 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10074 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10075 DstVT = MVT::getIntegerVT(EltSizeInBits);
10076 if ((NumSrcElts * EltSizeInBits) >= 128) {
10077 // ISD::TRUNCATE
10078 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10079 } else {
10080 // X86ISD::VTRUNC
10081 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10082 }
10083 return true;
10084 }
10085
10086 return false;
10087}
10088
10089// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10090// element padding to the final DstVT.
10091static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10092 const X86Subtarget &Subtarget,
10093 SelectionDAG &DAG, bool ZeroUppers) {
10094 MVT SrcVT = Src.getSimpleValueType();
10095 MVT DstSVT = DstVT.getScalarType();
10096 unsigned NumDstElts = DstVT.getVectorNumElements();
10097 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10098 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10099
10100 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10101 return SDValue();
10102
10103 // Perform a direct ISD::TRUNCATE if possible.
10104 if (NumSrcElts == NumDstElts)
10105 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10106
10107 if (NumSrcElts > NumDstElts) {
10108 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10109 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10110 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10111 }
10112
10113 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10114 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10115 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10116 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10117 DstVT.getSizeInBits());
10118 }
10119
10120 // Non-VLX targets must truncate from a 512-bit type, so we need to
10121 // widen, truncate and then possibly extract the original subvector.
10122 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10123 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10124 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10125 }
10126
10127 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10128 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10129 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10130 if (DstVT != TruncVT)
10131 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10132 DstVT.getSizeInBits());
10133 return Trunc;
10134}
10135
10136// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10137//
10138// An example is the following:
10139//
10140// t0: ch = EntryToken
10141// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10142// t25: v4i32 = truncate t2
10143// t41: v8i16 = bitcast t25
10144// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10145// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10146// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10147// t18: v2i64 = bitcast t51
10148//
10149// One can just use a single vpmovdw instruction; without avx512vl we need to
10150// use the zmm variant and extract the lower subvector, padding with zeroes.
10151// TODO: Merge with lowerShuffleAsVTRUNC.
10152static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10153 SDValue V2, ArrayRef<int> Mask,
10154 const APInt &Zeroable,
10155 const X86Subtarget &Subtarget,
10156 SelectionDAG &DAG) {
10157 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10158 if (!Subtarget.hasAVX512())
10159 return SDValue();
10160
10161 unsigned NumElts = VT.getVectorNumElements();
10162 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10163 unsigned MaxScale = 64 / EltSizeInBits;
10164 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10165 unsigned SrcEltBits = EltSizeInBits * Scale;
10166 unsigned NumSrcElts = NumElts / Scale;
10167 unsigned UpperElts = NumElts - NumSrcElts;
10168 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10169 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10170 continue;
10171
10172 // Attempt to find a matching source truncation, but as a fall back VLX
10173 // cases can use the VPMOV directly.
10174 SDValue Src = peekThroughBitcasts(V1);
10175 if (Src.getOpcode() == ISD::TRUNCATE &&
10176 Src.getScalarValueSizeInBits() == SrcEltBits) {
10177 Src = Src.getOperand(0);
10178 } else if (Subtarget.hasVLX()) {
10179 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10180 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10181 Src = DAG.getBitcast(SrcVT, Src);
10182 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10183 if (Scale == 2 &&
10184 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10185 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10186 return SDValue();
10187 } else
10188 return SDValue();
10189
10190 // VPMOVWB is only available with avx512bw.
10191 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10192 return SDValue();
10193
10194 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10195 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10196 }
10197
10198 return SDValue();
10199}
10200
10201// Attempt to match binary shuffle patterns as a truncate.
10202static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10203 SDValue V2, ArrayRef<int> Mask,
10204 const APInt &Zeroable,
10205 const X86Subtarget &Subtarget,
10206 SelectionDAG &DAG) {
10207 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10208 "Unexpected VTRUNC type");
10209 if (!Subtarget.hasAVX512())
10210 return SDValue();
10211
10212 unsigned NumElts = VT.getVectorNumElements();
10213 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10214 unsigned MaxScale = 64 / EltSizeInBits;
10215 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10216 // TODO: Support non-BWI VPMOVWB truncations?
10217 unsigned SrcEltBits = EltSizeInBits * Scale;
10218 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10219 continue;
10220
10221 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10222 // Bail if the V2 elements are undef.
10223 unsigned NumHalfSrcElts = NumElts / Scale;
10224 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10225 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10226 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10227 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10228 continue;
10229
10230 // The elements beyond the truncation must be undef/zero.
10231 unsigned UpperElts = NumElts - NumSrcElts;
10232 if (UpperElts > 0 &&
10233 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10234 continue;
10235 bool UndefUppers =
10236 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10237
10238 // For offset truncations, ensure that the concat is cheap.
10239 if (Offset) {
10240 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10241 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10242 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10243 return Lo.getOperand(0) == Hi.getOperand(0);
10244 if (ISD::isNormalLoad(Lo.getNode()) &&
10245 ISD::isNormalLoad(Hi.getNode())) {
10246 auto *LDLo = cast<LoadSDNode>(Lo);
10247 auto *LDHi = cast<LoadSDNode>(Hi);
10248 return DAG.areNonVolatileConsecutiveLoads(
10249 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10250 }
10251 return false;
10252 };
10253 if (!IsCheapConcat(V1, V2))
10254 continue;
10255 }
10256
10257 // As we're using both sources then we need to concat them together
10258 // and truncate from the double-sized src.
10259 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10260 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10261
10262 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10263 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10264 Src = DAG.getBitcast(SrcVT, Src);
10265
10266 // Shift the offset'd elements into place for the truncation.
10267 // TODO: Use getTargetVShiftByConstNode.
10268 if (Offset)
10269 Src = DAG.getNode(
10270 X86ISD::VSRLI, DL, SrcVT, Src,
10271 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10272
10273 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10274 }
10275 }
10276
10277 return SDValue();
10278}
10279
10280/// Check whether a compaction lowering can be done by dropping even/odd
10281/// elements and compute how many times even/odd elements must be dropped.
10282///
10283/// This handles shuffles which take every 2^N-th element, i.e. shuffles where
10284/// the even/odd elements are dropped N times. Example shuffle masks:
10285///
10286/// (even)
10287/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10288/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10289/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10290/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10291/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10292/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10293///
10294/// (odd)
10295/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10296/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10297///
10298/// Any of these lanes can of course be undef.
10299///
10300/// This routine only supports N <= 3.
10301/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10302/// for larger N.
10303///
10304/// \returns N above, or the number of times even/odd elements must be dropped
10305/// if there is such a number. Otherwise returns zero.
10306static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10307 bool IsSingleInput) {
10308 // The modulus for the shuffle vector entries is based on whether this is
10309 // a single input or not.
10310 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10311 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10312 "We should only be called with masks with a power-of-2 size!");
10313
10314 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10315 int Offset = MatchEven ? 0 : 1;
10316
10317 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10318 // and 2^3 simultaneously. This is because we may have ambiguity with
10319 // partially undef inputs.
10320 bool ViableForN[3] = {true, true, true};
10321
10322 for (int i = 0, e = Mask.size(); i < e; ++i) {
10323 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10324 // want.
10325 if (Mask[i] < 0)
10326 continue;
10327
10328 bool IsAnyViable = false;
10329 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10330 if (ViableForN[j]) {
10331 uint64_t N = j + 1;
10332
10333 // The shuffle mask must be equal to (i * 2^N) % M.
10334 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10335 IsAnyViable = true;
10336 else
10337 ViableForN[j] = false;
10338 }
10339 // Early exit if we exhaust the possible powers of two.
10340 if (!IsAnyViable)
10341 break;
10342 }
10343
10344 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10345 if (ViableForN[j])
10346 return j + 1;
10347
10348 // Return 0 as there is no viable power of two.
10349 return 0;
10350}
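// Worked example (editorial addition, not from the original source): for the
// v16i8 single-input mask <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14> the modulus
// is 16 and every defined element satisfies Mask[i] == (i << 1) % 16, so only
// N = 1 stays viable and the routine returns 1 (drop the odd elements once).
// The equivalent odd mask <1,3,5,7,...> matches the same way with MatchEven
// set to false (Offset == 1).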
10351
10352// X86 has dedicated pack instructions that can handle specific truncation
10353// operations: PACKSS and PACKUS.
10354// Checks for compaction shuffle masks if MaxStages > 1.
10355// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10356static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10357 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10358 const SelectionDAG &DAG,
10359 const X86Subtarget &Subtarget,
10360 unsigned MaxStages = 1) {
10361 unsigned NumElts = VT.getVectorNumElements();
10362 unsigned BitSize = VT.getScalarSizeInBits();
10363 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10364 "Illegal maximum compaction");
10365
10366 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10367 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10368 unsigned NumPackedBits = NumSrcBits - BitSize;
10369 N1 = peekThroughBitcasts(N1);
10370 N2 = peekThroughBitcasts(N2);
10371 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10372 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10373 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10374 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10375 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10376 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10377 return false;
10378 if (Subtarget.hasSSE41() || BitSize == 8) {
10379 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10380 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10381 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10382 V1 = N1;
10383 V2 = N2;
10384 SrcVT = PackVT;
10385 PackOpcode = X86ISD::PACKUS;
10386 return true;
10387 }
10388 }
10389 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10390 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10391 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10392 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10393 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10394 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10395 V1 = N1;
10396 V2 = N2;
10397 SrcVT = PackVT;
10398 PackOpcode = X86ISD::PACKSS;
10399 return true;
10400 }
10401 return false;
10402 };
10403
10404 // Attempt to match against wider and wider compaction patterns.
10405 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10406 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10407 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10408
10409 // Try binary shuffle.
10410 SmallVector<int, 32> BinaryMask;
10411 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10412 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10413 if (MatchPACK(V1, V2, PackVT))
10414 return true;
10415
10416 // Try unary shuffle.
10417 SmallVector<int, 32> UnaryMask;
10418 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10419 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10420 if (MatchPACK(V1, V1, PackVT))
10421 return true;
10422 }
10423
10424 return false;
10425}
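// Illustrative note (editorial addition, not from the original source): when
// compacting v4i32 inputs down to a v8i16 result, MatchPACK above selects
// PACKUS (PACKUSDW, SSE4.1) if known-bits analysis proves the upper 16 bits of
// every lane are zero, and falls back to PACKSS (PACKSSDW) when each input has
// more than 16 sign bits, i.e. the lanes are already sign-extended from i16.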
10426
10427static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10428 SDValue V1, SDValue V2, SelectionDAG &DAG,
10429 const X86Subtarget &Subtarget) {
10430 MVT PackVT;
10431 unsigned PackOpcode;
10432 unsigned SizeBits = VT.getSizeInBits();
10433 unsigned EltBits = VT.getScalarSizeInBits();
10434 unsigned MaxStages = Log2_32(64 / EltBits);
10435 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10436 Subtarget, MaxStages))
10437 return SDValue();
10438
10439 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10440 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10441
10442 // Don't lower multi-stage packs on AVX512, truncation is better.
10443 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10444 return SDValue();
10445
10446 // Pack to the largest type possible:
10447 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10448 unsigned MaxPackBits = 16;
10449 if (CurrentEltBits > 16 &&
10450 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10451 MaxPackBits = 32;
10452
10453 // Repeatedly pack down to the target size.
10454 SDValue Res;
10455 for (unsigned i = 0; i != NumStages; ++i) {
10456 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10457 unsigned NumSrcElts = SizeBits / SrcEltBits;
10458 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10459 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10460 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10461 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10462 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10463 DAG.getBitcast(SrcVT, V2));
10464 V1 = V2 = Res;
10465 CurrentEltBits /= 2;
10466 }
10467 assert(Res && Res.getValueType() == VT &&
10468 "Failed to lower compaction shuffle");
10469 return Res;
10470}
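// Illustrative example (editorial addition, not from the original source): a
// v16i8 mask that compacts v4i32 sources by 4x matches with NumStages == 2, so
// the loop above emits two packs, e.g. PACKSSDW (i32 -> i16) followed by
// PACKSSWB (i16 -> i8), feeding the intermediate result back in as both
// operands of the second stage.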
10471
10472/// Try to emit a bitmask instruction for a shuffle.
10473///
10474/// This handles cases where we can model a blend exactly as a bitmask due to
10475/// one of the inputs being zeroable.
10476static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10477 SDValue V2, ArrayRef<int> Mask,
10478 const APInt &Zeroable,
10479 const X86Subtarget &Subtarget,
10480 SelectionDAG &DAG) {
10481 MVT MaskVT = VT;
10482 MVT EltVT = VT.getVectorElementType();
10483 SDValue Zero, AllOnes;
10484 // Use f64 if i64 isn't legal.
10485 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10486 EltVT = MVT::f64;
10487 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10488 }
10489
10490 MVT LogicVT = VT;
10491 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10492 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10493 APFloat AllOnesValue =
10494 APFloat::getAllOnes(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10495 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10496 LogicVT =
10497 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10498 } else {
10499 Zero = DAG.getConstant(0, DL, EltVT);
10500 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10501 }
10502
10503 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10504 SDValue V;
10505 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10506 if (Zeroable[i])
10507 continue;
10508 if (Mask[i] % Size != i)
10509 return SDValue(); // Not a blend.
10510 if (!V)
10511 V = Mask[i] < Size ? V1 : V2;
10512 else if (V != (Mask[i] < Size ? V1 : V2))
10513 return SDValue(); // Can only let one input through the mask.
10514
10515 VMaskOps[i] = AllOnes;
10516 }
10517 if (!V)
10518 return SDValue(); // No non-zeroable elements!
10519
10520 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10521 VMask = DAG.getBitcast(LogicVT, VMask);
10522 V = DAG.getBitcast(LogicVT, V);
10523 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10524 return DAG.getBitcast(VT, And);
10525}
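// Illustrative example (editorial addition, not from the original source): for
// a v4i32 shuffle with Mask = <0,1,6,3> where mask element 2 selects a
// known-zero lane of V2 (so Zeroable[2] is set), the loop above builds the
// constant vector <-1,-1,0,-1>, picks V1 as the single surviving input, and
// lowers the whole blend to a single AND of V1 with that mask.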
10526
10527/// Try to emit a blend instruction for a shuffle using bit math.
10528///
10529/// This is used as a fallback approach when first class blend instructions are
10530/// unavailable. Currently it is only suitable for integer vectors, but could
10531/// be generalized for floating point vectors if desirable.
10532static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10533 SDValue V2, ArrayRef<int> Mask,
10534 SelectionDAG &DAG) {
10535 assert(VT.isInteger() && "Only supports integer vector types!");
10536 MVT EltVT = VT.getVectorElementType();
10537 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10538 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10539 SmallVector<SDValue, 16> MaskOps;
10540 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10541 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10542 return SDValue(); // Shuffled input!
10543 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10544 }
10545
10546 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10547 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10548}
10549
10550static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10551 SDValue PreservedSrc,
10552 const X86Subtarget &Subtarget,
10553 SelectionDAG &DAG);
10554
10555static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10556 MutableArrayRef<int> Mask,
10557 const APInt &Zeroable, bool &ForceV1Zero,
10558 bool &ForceV2Zero, uint64_t &BlendMask) {
10559 bool V1IsZeroOrUndef =
10560 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10561 bool V2IsZeroOrUndef =
10562 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10563
10564 BlendMask = 0;
10565 ForceV1Zero = false, ForceV2Zero = false;
10566 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10567
10568 int NumElts = Mask.size();
10569 int NumLanes = VT.getSizeInBits() / 128;
10570 int NumEltsPerLane = NumElts / NumLanes;
10571 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10572
10573 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10574 // then ensure the blend mask part for that lane just references that input.
10575 bool ForceWholeLaneMasks =
10576 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10577
10578 // Attempt to generate the binary blend mask. If an input is zero then
10579 // we can use any lane.
10580 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10581 // Keep track of the inputs used per lane.
10582 bool LaneV1InUse = false;
10583 bool LaneV2InUse = false;
10584 uint64_t LaneBlendMask = 0;
10585 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10586 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10587 int M = Mask[Elt];
10588 if (M == SM_SentinelUndef)
10589 continue;
10590 if (M == Elt || (0 <= M && M < NumElts &&
10591 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10592 Mask[Elt] = Elt;
10593 LaneV1InUse = true;
10594 continue;
10595 }
10596 if (M == (Elt + NumElts) ||
10597 (NumElts <= M &&
10598 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10599 LaneBlendMask |= 1ull << LaneElt;
10600 Mask[Elt] = Elt + NumElts;
10601 LaneV2InUse = true;
10602 continue;
10603 }
10604 if (Zeroable[Elt]) {
10605 if (V1IsZeroOrUndef) {
10606 ForceV1Zero = true;
10607 Mask[Elt] = Elt;
10608 LaneV1InUse = true;
10609 continue;
10610 }
10611 if (V2IsZeroOrUndef) {
10612 ForceV2Zero = true;
10613 LaneBlendMask |= 1ull << LaneElt;
10614 Mask[Elt] = Elt + NumElts;
10615 LaneV2InUse = true;
10616 continue;
10617 }
10618 }
10619 return false;
10620 }
10621
10622 // If we only used V2 then splat the lane blend mask to avoid any demanded
10623 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10624 // blend mask bit).
10625 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10626 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10627
10628 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10629 }
10630 return true;
10631}
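// Worked example (editorial addition, not from the original source): for a
// v8i16 shuffle with Mask = <0,9,2,11,4,13,6,15>, every odd element selects
// from V2, so the loop above produces BlendMask = 0xAA (0b10101010) and leaves
// ForceV1Zero/ForceV2Zero clear; lowerShuffleAsBlend below can then emit a
// single BLENDI with that immediate.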
10632
10633/// Try to emit a blend instruction for a shuffle.
10634///
10635/// This doesn't do any checks for the availability of instructions for blending
10636/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10637/// be matched in the backend with the type given. What it does check for is
10638/// that the shuffle mask is a blend, or convertible into a blend with zero.
10639static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10640 SDValue V2, ArrayRef<int> Original,
10641 const APInt &Zeroable,
10642 const X86Subtarget &Subtarget,
10643 SelectionDAG &DAG) {
10644 uint64_t BlendMask = 0;
10645 bool ForceV1Zero = false, ForceV2Zero = false;
10646 SmallVector<int, 64> Mask(Original);
10647 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10648 BlendMask))
10649 return SDValue();
10650
10651 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10652 if (ForceV1Zero)
10653 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10654 if (ForceV2Zero)
10655 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10656
10657 unsigned NumElts = VT.getVectorNumElements();
10658
10659 switch (VT.SimpleTy) {
10660 case MVT::v4i64:
10661 case MVT::v8i32:
10662 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10663 [[fallthrough]];
10664 case MVT::v4f64:
10665 case MVT::v8f32:
10666 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10667 [[fallthrough]];
10668 case MVT::v2f64:
10669 case MVT::v2i64:
10670 case MVT::v4f32:
10671 case MVT::v4i32:
10672 case MVT::v8i16:
10673 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10674 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10675 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10676 case MVT::v16i16: {
10677 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10678 SmallVector<int, 8> RepeatedMask;
10679 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10680 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10681 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10682 BlendMask = 0;
10683 for (int i = 0; i < 8; ++i)
10684 if (RepeatedMask[i] >= 8)
10685 BlendMask |= 1ull << i;
10686 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10687 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10688 }
10689 // Use PBLENDW for lower/upper lanes and then blend lanes.
10690 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10691 // merge to VSELECT where useful.
10692 uint64_t LoMask = BlendMask & 0xFF;
10693 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10694 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10695 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10696 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10697 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10698 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10699 return DAG.getVectorShuffle(
10700 MVT::v16i16, DL, Lo, Hi,
10701 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10702 }
10703 [[fallthrough]];
10704 }
10705 case MVT::v32i8:
10706 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10707 [[fallthrough]];
10708 case MVT::v16i8: {
10709 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10710
10711 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10712 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10713 Subtarget, DAG))
10714 return Masked;
10715
10716 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10717 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10718 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10719 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10720 }
10721
10722 // If we have VPTERNLOG, we can use that as a bit blend.
10723 if (Subtarget.hasVLX())
10724 if (SDValue BitBlend =
10725 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10726 return BitBlend;
10727
10728 // Scale the blend by the number of bytes per element.
10729 int Scale = VT.getScalarSizeInBits() / 8;
10730
10731 // This form of blend is always done on bytes. Compute the byte vector
10732 // type.
10733 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10734
10735 // x86 allows load folding with blendvb from the 2nd source operand. But
10736 // we are still using LLVM select here (see comment below), so that's V1.
10737 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10738 // allow that load-folding possibility.
10739 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10740 ShuffleVectorSDNode::commuteMask(Mask);
10741 std::swap(V1, V2);
10742 }
10743
10744 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10745 // mix of LLVM's code generator and the x86 backend. We tell the code
10746 // generator that boolean values in the elements of an x86 vector register
10747 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10748 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10749 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10750 // of the element (the remaining are ignored) and 0 in that high bit would
10751 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10752 // the LLVM model for boolean values in vector elements gets the relevant
10753 // bit set, it is set backwards and over constrained relative to x86's
10754 // actual model.
10755 SmallVector<SDValue, 32> VSELECTMask;
10756 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10757 for (int j = 0; j < Scale; ++j)
10758 VSELECTMask.push_back(
10759 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10760 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10761 MVT::i8));
10762
10763 V1 = DAG.getBitcast(BlendVT, V1);
10764 V2 = DAG.getBitcast(BlendVT, V2);
10765 return DAG.getBitcast(
10766 VT,
10767 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10768 V1, V2));
10769 }
10770 case MVT::v16f32:
10771 case MVT::v8f64:
10772 case MVT::v8i64:
10773 case MVT::v16i32:
10774 case MVT::v32i16:
10775 case MVT::v64i8: {
10776 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10777 bool OptForSize = DAG.shouldOptForSize();
10778 if (!OptForSize) {
10779 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10780 Subtarget, DAG))
10781 return Masked;
10782 }
10783
10784 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10785 // masked move.
10786 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10787 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10788 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10789 }
10790 default:
10791 llvm_unreachable("Not a supported integer vector type!");
10792 }
10793}
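// Illustrative note (editorial addition, not from the original source): in the
// byte-blend path above each mask element expands to Scale condition bytes,
// -1 for elements taken from V1, 0 for elements taken from V2 and undef for
// undef entries; e.g. a v16i8 Mask = <0,17,2,19,...> yields a condition vector
// starting <-1,0,-1,0,...>, and the blend is emitted as one VSELECT over bytes.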
10794
10795/// Try to lower as a blend of elements from two inputs followed by
10796/// a single-input permutation.
10797///
10798/// This matches the pattern where we can blend elements from two inputs and
10799/// then reduce the shuffle to a single-input permutation.
10800static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10801 SDValue V1, SDValue V2,
10802 ArrayRef<int> Mask,
10803 SelectionDAG &DAG,
10804 bool ImmBlends = false) {
10805 // We build up the blend mask while checking whether a blend is a viable way
10806 // to reduce the shuffle.
10807 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10808 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10809
10810 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10811 if (Mask[i] < 0)
10812 continue;
10813
10814 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10815
10816 if (BlendMask[Mask[i] % Size] < 0)
10817 BlendMask[Mask[i] % Size] = Mask[i];
10818 else if (BlendMask[Mask[i] % Size] != Mask[i])
10819 return SDValue(); // Can't blend in the needed input!
10820
10821 PermuteMask[i] = Mask[i] % Size;
10822 }
10823
10824 // If only immediate blends, then bail if the blend mask can't be widened to
10825 // i16.
10826 unsigned EltSize = VT.getScalarSizeInBits();
10827 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10828 return SDValue();
10829
10830 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10831 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10832}
10833
10834/// Try to lower as an unpack of elements from two inputs followed by
10835/// a single-input permutation.
10836///
10837/// This matches the pattern where we can unpack elements from two inputs and
10838/// then reduce the shuffle to a single-input (wider) permutation.
10839static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10840 SDValue V1, SDValue V2,
10841 ArrayRef<int> Mask,
10842 SelectionDAG &DAG) {
10843 int NumElts = Mask.size();
10844 int NumLanes = VT.getSizeInBits() / 128;
10845 int NumLaneElts = NumElts / NumLanes;
10846 int NumHalfLaneElts = NumLaneElts / 2;
10847
10848 bool MatchLo = true, MatchHi = true;
10849 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10850
10851 // Determine UNPCKL/UNPCKH type and operand order.
10852 for (int Elt = 0; Elt != NumElts; ++Elt) {
10853 int M = Mask[Elt];
10854 if (M < 0)
10855 continue;
10856
10857 // Normalize the mask value depending on whether it's V1 or V2.
10858 int NormM = M;
10859 SDValue &Op = Ops[Elt & 1];
10860 if (M < NumElts && (Op.isUndef() || Op == V1))
10861 Op = V1;
10862 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10863 Op = V2;
10864 NormM -= NumElts;
10865 } else
10866 return SDValue();
10867
10868 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10869 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10870 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10871 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10872 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10873 if (MatchLoAnyLane || MatchHiAnyLane) {
10874 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10875 "Failed to match UNPCKLO/UNPCKHI");
10876 break;
10877 }
10878 }
10879 MatchLo &= MatchLoAnyLane;
10880 MatchHi &= MatchHiAnyLane;
10881 if (!MatchLo && !MatchHi)
10882 return SDValue();
10883 }
10884 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10885
10886 // Element indices have changed after unpacking. Calculate permute mask
10887 // so that they will be put back to the position as dictated by the
10888 // original shuffle mask indices.
10889 SmallVector<int, 32> PermuteMask(NumElts, -1);
10890 for (int Elt = 0; Elt != NumElts; ++Elt) {
10891 int M = Mask[Elt];
10892 if (M < 0)
10893 continue;
10894 int NormM = M;
10895 if (NumElts <= M)
10896 NormM -= NumElts;
10897 bool IsFirstOp = M < NumElts;
10898 int BaseMaskElt =
10899 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10900 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10901 PermuteMask[Elt] = BaseMaskElt;
10902 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10903 PermuteMask[Elt] = BaseMaskElt + 1;
10904 assert(PermuteMask[Elt] != -1 &&
10905 "Input mask element is defined but failed to assign permute mask");
10906 }
10907
10908 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10909 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10910 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10911}
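// Worked example (editorial addition, not from the original source): for a
// v8i16 shuffle with Mask = <0,8,2,10,1,9,3,11> every element comes from the
// low half of its source, so the code above forms UNPCKL(V1, V2) and then the
// single-input permute <0,1,4,5,2,3,6,7> to restore the requested order.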
10912
10913/// Try to lower a shuffle as a permute of the inputs followed by an
10914/// UNPCK instruction.
10915///
10916/// This specifically targets cases where we end up with alternating between
10917/// the two inputs, and so can permute them into something that feeds a single
10918/// UNPCK instruction. Note that this routine only targets integer vectors
10919/// because for floating point vectors we have a generalized SHUFPS lowering
10920/// strategy that handles everything that doesn't *exactly* match an unpack,
10921/// making this clever lowering unnecessary.
10922static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10923 SDValue V1, SDValue V2,
10924 ArrayRef<int> Mask,
10925 const X86Subtarget &Subtarget,
10926 SelectionDAG &DAG) {
10927 int Size = Mask.size();
10928 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10929
10930 // This routine only supports 128-bit integer dual input vectors.
10931 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10932 return SDValue();
10933
10934 int NumLoInputs =
10935 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10936 int NumHiInputs =
10937 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10938
10939 bool UnpackLo = NumLoInputs >= NumHiInputs;
10940
10941 auto TryUnpack = [&](int ScalarSize, int Scale) {
10942 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10943 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10944
10945 for (int i = 0; i < Size; ++i) {
10946 if (Mask[i] < 0)
10947 continue;
10948
10949 // Each element of the unpack contains Scale elements from this mask.
10950 int UnpackIdx = i / Scale;
10951
10952 // We only handle the case where V1 feeds the first slots of the unpack.
10953 // We rely on canonicalization to ensure this is the case.
10954 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10955 return SDValue();
10956
10957 // Setup the mask for this input. The indexing is tricky as we have to
10958 // handle the unpack stride.
10959 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10960 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10961 Mask[i] % Size;
10962 }
10963
10964 // If we will have to shuffle both inputs to use the unpack, check whether
10965 // we can just unpack first and shuffle the result. If so, skip this unpack.
10966 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10967 !isNoopShuffleMask(V2Mask))
10968 return SDValue();
10969
10970 // Shuffle the inputs into place.
10971 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10972 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10973
10974 // Cast the inputs to the type we will use to unpack them.
10975 MVT UnpackVT =
10976 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10977 V1 = DAG.getBitcast(UnpackVT, V1);
10978 V2 = DAG.getBitcast(UnpackVT, V2);
10979
10980 // Unpack the inputs and cast the result back to the desired type.
10981 return DAG.getBitcast(
10982 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10983 UnpackVT, V1, V2));
10984 };
10985
10986 // We try each unpack from the largest to the smallest to try and find one
10987 // that fits this mask.
10988 int OrigScalarSize = VT.getScalarSizeInBits();
10989 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10990 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10991 return Unpack;
10992
10993 // If we're shuffling with a zero vector then we're better off not doing
10994 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10995 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10996 ISD::isBuildVectorAllZeros(V2.getNode()))
10997 return SDValue();
10998
10999 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11000 // initial unpack.
11001 if (NumLoInputs == 0 || NumHiInputs == 0) {
11002 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11003 "We have to have *some* inputs!");
11004 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11005
11006 // FIXME: We could consider the total complexity of the permute of each
11007 // possible unpacking. Or at the least we should consider how many
11008 // half-crossings are created.
11009 // FIXME: We could consider commuting the unpacks.
11010
11011 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11012 for (int i = 0; i < Size; ++i) {
11013 if (Mask[i] < 0)
11014 continue;
11015
11016 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11017
11018 PermMask[i] =
11019 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11020 }
11021 return DAG.getVectorShuffle(
11022 VT, DL,
11023 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11024 V1, V2),
11025 DAG.getUNDEF(VT), PermMask);
11026 }
11027
11028 return SDValue();
11029}
11030
11031/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11032/// permuting the elements of the result in place.
11033static SDValue lowerShuffleAsByteRotateAndPermute(
11034 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11035 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11036 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11037 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11038 (VT.is512BitVector() && !Subtarget.hasBWI()))
11039 return SDValue();
11040
11041 // We don't currently support lane crossing permutes.
11042 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11043 return SDValue();
11044
11045 int Scale = VT.getScalarSizeInBits() / 8;
11046 int NumLanes = VT.getSizeInBits() / 128;
11047 int NumElts = VT.getVectorNumElements();
11048 int NumEltsPerLane = NumElts / NumLanes;
11049
11050 // Determine range of mask elts.
11051 bool Blend1 = true;
11052 bool Blend2 = true;
11053 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11054 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11055 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11056 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11057 int M = Mask[Lane + Elt];
11058 if (M < 0)
11059 continue;
11060 if (M < NumElts) {
11061 Blend1 &= (M == (Lane + Elt));
11062 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11063 M = M % NumEltsPerLane;
11064 Range1.first = std::min(Range1.first, M);
11065 Range1.second = std::max(Range1.second, M);
11066 } else {
11067 M -= NumElts;
11068 Blend2 &= (M == (Lane + Elt));
11069 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11070 M = M % NumEltsPerLane;
11071 Range2.first = std::min(Range2.first, M);
11072 Range2.second = std::max(Range2.second, M);
11073 }
11074 }
11075 }
11076
11077 // Bail if we don't need both elements.
11078 // TODO - it might be worth doing this for unary shuffles if the permute
11079 // can be widened.
11080 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11081 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11082 return SDValue();
11083
11084 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11085 return SDValue();
11086
11087 // Rotate the 2 ops so we can access both ranges, then permute the result.
11088 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11089 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11090 SDValue Rotate = DAG.getBitcast(
11091 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11092 DAG.getBitcast(ByteVT, Lo),
11093 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11094 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11095 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11096 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11097 int M = Mask[Lane + Elt];
11098 if (M < 0)
11099 continue;
11100 if (M < NumElts)
11101 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11102 else
11103 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11104 }
11105 }
11106 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11107 };
11108
11109 // Check if the ranges are small enough to rotate from either direction.
11110 if (Range2.second < Range1.first)
11111 return RotateAndPermute(V1, V2, Range1.first, 0);
11112 if (Range1.second < Range2.first)
11113 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11114 return SDValue();
11115}
11116
11117static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11118 return isUndefOrEqual(Mask, 0);
11119}
11120
11121static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11122 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11123}
11124
11125/// Check if the Mask consists of the same element repeated multiple times.
11126static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11127 size_t NumUndefs = 0;
11128 std::optional<int> UniqueElt;
11129 for (int Elt : Mask) {
11130 if (Elt == SM_SentinelUndef) {
11131 NumUndefs++;
11132 continue;
11133 }
11134 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11135 return false;
11136 UniqueElt = Elt;
11137 }
11138 // Make sure the element is repeated enough times by checking the number of
11139 // undefs is small.
11140 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11141}
11142
11143/// Generic routine to decompose a shuffle and blend into independent
11144/// blends and permutes.
11145///
11146/// This matches the extremely common pattern for handling combined
11147/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11148/// operations. It will try to pick the best arrangement of shuffles and
11149/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11150static SDValue lowerShuffleAsDecomposedShuffleMerge(
11151 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11152 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11153 int NumElts = Mask.size();
11154 int NumLanes = VT.getSizeInBits() / 128;
11155 int NumEltsPerLane = NumElts / NumLanes;
11156
11157 // Shuffle the input elements into the desired positions in V1 and V2 and
11158 // unpack/blend them together.
11159 bool IsAlternating = true;
11160 SmallVector<int, 32> V1Mask(NumElts, -1);
11161 SmallVector<int, 32> V2Mask(NumElts, -1);
11162 SmallVector<int, 32> FinalMask(NumElts, -1);
11163 for (int i = 0; i < NumElts; ++i) {
11164 int M = Mask[i];
11165 if (M >= 0 && M < NumElts) {
11166 V1Mask[i] = M;
11167 FinalMask[i] = i;
11168 IsAlternating &= (i & 1) == 0;
11169 } else if (M >= NumElts) {
11170 V2Mask[i] = M - NumElts;
11171 FinalMask[i] = i + NumElts;
11172 IsAlternating &= (i & 1) == 1;
11173 }
11174 }
11175
11176 // If we effectively demand only the 0'th element of \p Input (though not
11177 // necessarily just at the 0'th position), then broadcast said input,
11178 // and change \p InputMask to be a no-op (identity) mask.
11179 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11180 &DAG](SDValue &Input,
11181 MutableArrayRef<int> InputMask) {
11182 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11183 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11184 !X86::mayFoldLoad(Input, Subtarget)))
11185 return;
11186 if (isNoopShuffleMask(InputMask))
11187 return;
11188 assert(isBroadcastShuffleMask(InputMask) &&
11189 "Expected to demand only the 0'th element.");
11190 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11191 for (auto I : enumerate(InputMask)) {
11192 int &InputMaskElt = I.value();
11193 if (InputMaskElt >= 0)
11194 InputMaskElt = I.index();
11195 }
11196 };
11197
11198 // Currently, we may need to produce one shuffle per input, and blend results.
11199 // It is possible that the shuffle for one of the inputs is already a no-op.
11200 // See if we can simplify non-no-op shuffles into broadcasts,
11201 // which we consider to be strictly better than an arbitrary shuffle.
11202 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11203 isNoopOrBroadcastShuffleMask(V2Mask)) {
11204 canonicalizeBroadcastableInput(V1, V1Mask);
11205 canonicalizeBroadcastableInput(V2, V2Mask);
11206 }
11207
11208 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11209 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11210 // the shuffle may be able to fold with a load or other benefit. However, when
11211 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11212 // pre-shuffle first is a better strategy.
11213 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11214 // Only prefer immediate blends to unpack/rotate.
11215 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11216 DAG, true))
11217 return BlendPerm;
11218 // If either input vector provides only a single element which is repeated
11219 // multiple times, unpacking from both input vectors would generate worse
11220 // code. e.g. for
11221 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11222 // it is better to process t4 first to create a vector of t4[0], then unpack
11223 // that vector with t2.
11224 if (!isSingleElementRepeatedMask(V1Mask) &&
11225 !isSingleElementRepeatedMask(V2Mask))
11226 if (SDValue UnpackPerm =
11227 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11228 return UnpackPerm;
11229 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11230 DL, VT, V1, V2, Mask, Subtarget, DAG))
11231 return RotatePerm;
11232 // Unpack/rotate failed - try again with variable blends.
11233 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11234 DAG))
11235 return BlendPerm;
11236 if (VT.getScalarSizeInBits() >= 32)
11237 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11238 DL, VT, V1, V2, Mask, Subtarget, DAG))
11239 return PermUnpack;
11240 }
11241
11242 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11243 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11244 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11245 // than half the elements coming from each source.
11246 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11247 V1Mask.assign(NumElts, -1);
11248 V2Mask.assign(NumElts, -1);
11249 FinalMask.assign(NumElts, -1);
11250 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11251 for (int j = 0; j != NumEltsPerLane; ++j) {
11252 int M = Mask[i + j];
11253 if (M >= 0 && M < NumElts) {
11254 V1Mask[i + (j / 2)] = M;
11255 FinalMask[i + j] = i + (j / 2);
11256 } else if (M >= NumElts) {
11257 V2Mask[i + (j / 2)] = M - NumElts;
11258 FinalMask[i + j] = i + (j / 2) + NumElts;
11259 }
11260 }
11261 }
11262
11263 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11264 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11265 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11266}
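// Worked example (editorial addition, not from the original source): for a
// v8i16 shuffle with Mask = <2,9,4,11,6,13,0,15> the split above produces
// V1Mask = <2,u,4,u,6,u,0,u>, V2Mask = <u,1,u,3,u,5,u,7> and
// FinalMask = <0,9,2,11,4,13,6,15>; because that final mask is an alternating
// vXi16 blend, the tail of the function re-packs the element halves so the
// merge becomes an UNPCKL of the two pre-shuffled inputs.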
11267
11268static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11269 const X86Subtarget &Subtarget,
11270 ArrayRef<int> Mask) {
11271 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11272 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11273
11274 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11275 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11276 int MaxSubElts = 64 / EltSizeInBits;
11277 unsigned RotateAmt, NumSubElts;
11278 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11279 MaxSubElts, NumSubElts, RotateAmt))
11280 return -1;
11281 unsigned NumElts = Mask.size();
11282 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11283 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11284 return RotateAmt;
11285}
11286
11287/// Lower shuffle using X86ISD::VROTLI rotations.
11288static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11289 ArrayRef<int> Mask,
11290 const X86Subtarget &Subtarget,
11291 SelectionDAG &DAG) {
11292 // Only XOP + AVX512 targets have bit rotation instructions.
11293 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11294 bool IsLegal =
11295 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11296 if (!IsLegal && Subtarget.hasSSE3())
11297 return SDValue();
11298
11299 MVT RotateVT;
11300 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11301 Subtarget, Mask);
11302 if (RotateAmt < 0)
11303 return SDValue();
11304
11305 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11306 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11307 // widen to vXi16 or more then the existing lowering should be better.
11308 if (!IsLegal) {
11309 if ((RotateAmt % 16) == 0)
11310 return SDValue();
11311 // TODO: Use getTargetVShiftByConstNode.
11312 unsigned ShlAmt = RotateAmt;
11313 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11314 V1 = DAG.getBitcast(RotateVT, V1);
11315 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11316 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11317 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11318 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11319 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11320 return DAG.getBitcast(VT, Rot);
11321 }
11322
11323 SDValue Rot =
11324 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11325 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11326 return DAG.getBitcast(VT, Rot);
11327}
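// Illustrative example (editorial addition, not from the original source): the
// v16i8 mask <3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14> rotates each i32 group
// left by 8 bits, so with XOP or AVX512 it becomes a single VROTLI of v4i32 by
// 8, while the pre-SSSE3 fallback above expands the same match into
// VSHLI/VSRLI of v4i32 OR'd together.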
11328
11329/// Try to match a vector shuffle as an element rotation.
11330///
11331/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11332static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11333 ArrayRef<int> Mask) {
11334 int NumElts = Mask.size();
11335
11336 // We need to detect various ways of spelling a rotation:
11337 // [11, 12, 13, 14, 15, 0, 1, 2]
11338 // [-1, 12, 13, 14, -1, -1, 1, -1]
11339 // [-1, -1, -1, -1, -1, -1, 1, 2]
11340 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11341 // [-1, 4, 5, 6, -1, -1, 9, -1]
11342 // [-1, 4, 5, 6, -1, -1, -1, -1]
11343 int Rotation = 0;
11344 SDValue Lo, Hi;
11345 for (int i = 0; i < NumElts; ++i) {
11346 int M = Mask[i];
11347 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11348 "Unexpected mask index.");
11349 if (M < 0)
11350 continue;
11351
11352 // Determine where a rotated vector would have started.
11353 int StartIdx = i - (M % NumElts);
11354 if (StartIdx == 0)
11355 // The identity rotation isn't interesting, stop.
11356 return -1;
11357
11358 // If we found the tail of a vector, the rotation must be the number of
11359 // elements missing from the front. If we found the head of a vector, the
11360 // rotation must be the number of its leading elements that are present.
11361 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11362
11363 if (Rotation == 0)
11364 Rotation = CandidateRotation;
11365 else if (Rotation != CandidateRotation)
11366 // The rotations don't match, so we can't match this mask.
11367 return -1;
11368
11369 // Compute which value this mask is pointing at.
11370 SDValue MaskV = M < NumElts ? V1 : V2;
11371
11372 // Compute which of the two target values this index should be assigned
11373 // to. This reflects whether the high elements are remaining or the low
11374 // elements are remaining.
11375 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11376
11377 // Either set up this value if we've not encountered it before, or check
11378 // that it remains consistent.
11379 if (!TargetV)
11380 TargetV = MaskV;
11381 else if (TargetV != MaskV)
11382 // This may be a rotation, but it pulls from the inputs in some
11383 // unsupported interleaving.
11384 return -1;
11385 }
11386
11387 // Check that we successfully analyzed the mask, and normalize the results.
11388 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11389 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11390 if (!Lo)
11391 Lo = Hi;
11392 else if (!Hi)
11393 Hi = Lo;
11394
11395 V1 = Lo;
11396 V2 = Hi;
11397
11398 return Rotation;
11399}
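// Worked example (editorial addition, not from the original source): for the
// v8i16 mask <11,12,13,14,15,0,1,2>, elements drawn from V2 give
// StartIdx = i - (M % 8) = -3 (a tail, so Hi = V2) and elements drawn from V1
// give StartIdx = 5 (a head, so Lo = V1); both agree on a rotation of 3, which
// is returned after setting V1 = Lo and V2 = Hi.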
11400
11401/// Try to lower a vector shuffle as a byte rotation.
11402///
11403/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11404/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11405/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11406/// try to generically lower a vector shuffle through such a pattern. It
11407/// does not check for the profitability of lowering either as PALIGNR or
11408/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11409/// This matches shuffle vectors that look like:
11410///
11411/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11412///
11413/// Essentially it concatenates V1 and V2, shifts right by some number of
11414/// elements, and takes the low elements as the result. Note that while this is
11415/// specified as a *right shift* because x86 is little-endian, it is a *left
11416/// rotate* of the vector lanes.
11417static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11418 ArrayRef<int> Mask) {
11419 // Don't accept any shuffles with zero elements.
11420 if (isAnyZero(Mask))
11421 return -1;
11422
11423 // PALIGNR works on 128-bit lanes.
11424 SmallVector<int, 16> RepeatedMask;
11425 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11426 return -1;
11427
11428 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11429 if (Rotation <= 0)
11430 return -1;
11431
11432 // PALIGNR rotates bytes, so we need to scale the
11433 // rotation based on how many bytes are in the vector lane.
11434 int NumElts = RepeatedMask.size();
11435 int Scale = 16 / NumElts;
11436 return Rotation * Scale;
11437}
11438
11439static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11440 SDValue V2, ArrayRef<int> Mask,
11441 const X86Subtarget &Subtarget,
11442 SelectionDAG &DAG) {
11443 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11444
11445 SDValue Lo = V1, Hi = V2;
11446 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11447 if (ByteRotation <= 0)
11448 return SDValue();
11449
11450 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11451 // PSLLDQ/PSRLDQ.
11452 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11453 Lo = DAG.getBitcast(ByteVT, Lo);
11454 Hi = DAG.getBitcast(ByteVT, Hi);
11455
11456 // SSSE3 targets can use the palignr instruction.
11457 if (Subtarget.hasSSSE3()) {
11458 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11459 "512-bit PALIGNR requires BWI instructions");
11460 return DAG.getBitcast(
11461 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11462 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11463 }
11464
11465 assert(VT.is128BitVector() &&
11466 "Rotate-based lowering only supports 128-bit lowering!");
11467 assert(Mask.size() <= 16 &&
11468 "Can shuffle at most 16 bytes in a 128-bit vector!");
11469 assert(ByteVT == MVT::v16i8 &&
11470 "SSE2 rotate lowering only needed for v16i8!");
11471
11472 // Default SSE2 implementation
11473 int LoByteShift = 16 - ByteRotation;
11474 int HiByteShift = ByteRotation;
11475
11476 SDValue LoShift =
11477 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11478 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11479 SDValue HiShift =
11480 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11481 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11482 return DAG.getBitcast(VT,
11483 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11484}
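// Illustrative example (editorial addition, not from the original source): a
// v8i16 element rotation of 3 scales to a byte rotation of 6, which the SSSE3
// path above emits as a single PALIGNR with immediate 6; the SSE2 fallback
// instead produces PSLLDQ by 10 bytes of Lo OR'd with PSRLDQ by 6 bytes of Hi.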
11485
11486/// Try to lower a vector shuffle as a dword/qword rotation.
11487///
11488/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11489/// rotation of the concatenation of two vectors; this routine will
11490/// try to generically lower a vector shuffle through such a pattern.
11491///
11492/// Essentially it concatenates V1 and V2, shifts right by some number of
11493/// elements, and takes the low elements as the result. Note that while this is
11494/// specified as a *right shift* because x86 is little-endian, it is a *left
11495/// rotate* of the vector lanes.
11496static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11497 SDValue V2, ArrayRef<int> Mask,
11498 const APInt &Zeroable,
11499 const X86Subtarget &Subtarget,
11500 SelectionDAG &DAG) {
11501 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11502 "Only 32-bit and 64-bit elements are supported!");
11503
11504 // 128/256-bit vectors are only supported with VLX.
11505 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11506 && "VLX required for 128/256-bit vectors");
11507
11508 SDValue Lo = V1, Hi = V2;
11509 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11510 if (0 < Rotation)
11511 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11512 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11513
11514 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11515 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11516 // TODO: We can probably make this more aggressive and use shift-pairs like
11517 // lowerShuffleAsByteShiftMask.
11518 unsigned NumElts = Mask.size();
11519 unsigned ZeroLo = Zeroable.countr_one();
11520 unsigned ZeroHi = Zeroable.countl_one();
11521 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11522 if (!ZeroLo && !ZeroHi)
11523 return SDValue();
11524
11525 if (ZeroLo) {
11526 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11527 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11528 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11529 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11530 getZeroVector(VT, Subtarget, DAG, DL),
11531 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11532 }
11533
11534 if (ZeroHi) {
11535 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11536 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11537 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11538 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11539 getZeroVector(VT, Subtarget, DAG, DL), Src,
11540 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11541 }
11542
11543 return SDValue();
11544}
11545
11546/// Try to lower a vector shuffle as a byte shift sequence.
11547static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11548 SDValue V2, ArrayRef<int> Mask,
11549 const APInt &Zeroable,
11550 const X86Subtarget &Subtarget,
11551 SelectionDAG &DAG) {
11552 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11553 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11554
11555 // We need a shuffle that has zeros at one/both ends and a sequential
11556 // shuffle from one source within.
11557 unsigned ZeroLo = Zeroable.countr_one();
11558 unsigned ZeroHi = Zeroable.countl_one();
11559 if (!ZeroLo && !ZeroHi)
11560 return SDValue();
11561
11562 unsigned NumElts = Mask.size();
11563 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11564 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11565 return SDValue();
11566
11567 unsigned Scale = VT.getScalarSizeInBits() / 8;
11568 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11569 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11570 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11571 return SDValue();
11572
11573 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11574 Res = DAG.getBitcast(MVT::v16i8, Res);
11575
11576 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11577 // inner sequential set of elements, possibly offset:
11578 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11579 // 01234567 --> 4567zzzz --> zzzzz456
11580 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11581 if (ZeroLo == 0) {
11582 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11583 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11584 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11585 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11586 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11587 } else if (ZeroHi == 0) {
11588 unsigned Shift = Mask[ZeroLo] % NumElts;
11589 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11590 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11591 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11592 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11593 } else if (!Subtarget.hasSSSE3()) {
11594 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11595 // by performing three byte shifts. Shuffle combining can kick in above that.
11596 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11597 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11598 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11599 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11600 Shift += Mask[ZeroLo] % NumElts;
11601 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11602 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11603 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11604 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11605 } else
11606 return SDValue();
11607
11608 return DAG.getBitcast(VT, Res);
11609}
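// Worked example (editorial addition, not from the original source): for a
// v8i16 mask <9,10,11,z,z,z,z,z> (three elements from V2 followed by five
// zeroable lanes), the ZeroLo == 0 path above shifts the v16i8 bitcast left by
// 8 bytes and then right by Scale * ZeroHi = 10 bytes, leaving V2's elements
// 1-3 at the bottom with the upper lanes zeroed.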
11610
11611/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11612///
11613/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11614/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11615/// matches elements from one of the input vectors shuffled to the left or
11616/// right with zeroable elements 'shifted in'. It handles both the strictly
11617/// bit-wise element shifts and the byte shift across an entire 128-bit double
11618/// quad word lane.
11619///
11620/// PSHL : (little-endian) left bit shift.
11621/// [ zz, 0, zz, 2 ]
11622/// [ -1, 4, zz, -1 ]
11623/// PSRL : (little-endian) right bit shift.
11624/// [ 1, zz, 3, zz]
11625/// [ -1, -1, 7, zz]
11626/// PSLLDQ : (little-endian) left byte shift
11627/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11628/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11629/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11630/// PSRLDQ : (little-endian) right byte shift
11631/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11632/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11633/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11634static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11635 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11636 int MaskOffset, const APInt &Zeroable,
11637 const X86Subtarget &Subtarget) {
11638 int Size = Mask.size();
11639 unsigned SizeInBits = Size * ScalarSizeInBits;
11640
11641 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11642 for (int i = 0; i < Size; i += Scale)
11643 for (int j = 0; j < Shift; ++j)
11644 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11645 return false;
11646
11647 return true;
11648 };
11649
11650 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11651 for (int i = 0; i != Size; i += Scale) {
11652 unsigned Pos = Left ? i + Shift : i;
11653 unsigned Low = Left ? i : i + Shift;
11654 unsigned Len = Scale - Shift;
11655 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11656 return -1;
11657 }
11658
11659 int ShiftEltBits = ScalarSizeInBits * Scale;
11660 bool ByteShift = ShiftEltBits > 64;
11661 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11662 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11663 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11664
11665 // Normalize the scale for byte shifts to still produce an i64 element
11666 // type.
11667 Scale = ByteShift ? Scale / 2 : Scale;
11668
11669 // We need to round trip through the appropriate type for the shift.
11670 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11671 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11672 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11673 return (int)ShiftAmt;
11674 };
11675
11676 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11677 // keep doubling the size of the integer elements up to that. We can
11678 // then shift the elements of the integer vector by whole multiples of
11679 // their width within the elements of the larger integer vector. Test each
11680 // multiple to see if we can find a match with the moved element indices
11681 // and that the shifted in elements are all zeroable.
11682 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11683 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11684 for (int Shift = 1; Shift != Scale; ++Shift)
11685 for (bool Left : {true, false})
11686 if (CheckZeros(Shift, Scale, Left)) {
11687 int ShiftAmt = MatchShift(Shift, Scale, Left);
11688 if (0 < ShiftAmt)
11689 return ShiftAmt;
11690 }
11691
11692 // no match
11693 return -1;
11694}
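// ---- Editorial example (not part of the LLVM source) ----------------------
// A small sketch of the kind of shift the matcher above recognises: the v4i32
// mask [zz, 0, zz, 2] matches at Scale = 2, so the vector is reinterpreted as
// v2i64 and each 64-bit element is shifted left by 32 bits (X86ISD::VSHLI),
// which is _mm_slli_epi64 at the intrinsic level. Illustration only.
#include <emmintrin.h>

static inline __m128i shuffleZZ0ZZ2(__m128i V) {
  // v4i32 [a0, a1, a2, a3] -> [0, a0, 0, a2]
  return _mm_slli_epi64(V, 32);
}
// ----------------------------------------------------------------------------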
11695
11696static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11697 SDValue V2, ArrayRef<int> Mask,
11698 const APInt &Zeroable,
11699 const X86Subtarget &Subtarget,
11700 SelectionDAG &DAG, bool BitwiseOnly) {
11701 int Size = Mask.size();
11702 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11703
11704 MVT ShiftVT;
11705 SDValue V = V1;
11706 unsigned Opcode;
11707
11708 // Try to match shuffle against V1 shift.
11709 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11710 Mask, 0, Zeroable, Subtarget);
11711
11712 // If V1 failed, try to match shuffle against V2 shift.
11713 if (ShiftAmt < 0) {
11714 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11715 Mask, Size, Zeroable, Subtarget);
11716 V = V2;
11717 }
11718
11719 if (ShiftAmt < 0)
11720 return SDValue();
11721
11722 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11723 return SDValue();
11724
11725 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11726 "Illegal integer vector type");
11727 V = DAG.getBitcast(ShiftVT, V);
11728 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11729 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11730 return DAG.getBitcast(VT, V);
11731}
11732
11733// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11734// Remainder of lower half result is zero and upper half is all undef.
11735static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11736 ArrayRef<int> Mask, uint64_t &BitLen,
11737 uint64_t &BitIdx, const APInt &Zeroable) {
11738 int Size = Mask.size();
11739 int HalfSize = Size / 2;
11740 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11741 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11742
11743 // Upper half must be undefined.
11744 if (!isUndefUpperHalf(Mask))
11745 return false;
11746
11747 // Determine the extraction length from the part of the
11748 // lower half that isn't zeroable.
11749 int Len = HalfSize;
11750 for (; Len > 0; --Len)
11751 if (!Zeroable[Len - 1])
11752 break;
11753 assert(Len > 0 && "Zeroable shuffle mask");
11754
11755 // Attempt to match first Len sequential elements from the lower half.
11756 SDValue Src;
11757 int Idx = -1;
11758 for (int i = 0; i != Len; ++i) {
11759 int M = Mask[i];
11760 if (M == SM_SentinelUndef)
11761 continue;
11762 SDValue &V = (M < Size ? V1 : V2);
11763 M = M % Size;
11764
11765 // The extracted elements must start at a valid index and all mask
11766 // elements must be in the lower half.
11767 if (i > M || M >= HalfSize)
11768 return false;
11769
11770 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11771 Src = V;
11772 Idx = M - i;
11773 continue;
11774 }
11775 return false;
11776 }
11777
11778 if (!Src || Idx < 0)
11779 return false;
11780
11781 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11782 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11783 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11784 V1 = Src;
11785 return true;
11786}
11787
11788// INSERTQ: Extract lowest Len elements from lower half of second source and
11789// insert over first source, starting at Idx.
11790// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11791static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11792 ArrayRef<int> Mask, uint64_t &BitLen,
11793 uint64_t &BitIdx) {
11794 int Size = Mask.size();
11795 int HalfSize = Size / 2;
11796 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11797
11798 // Upper half must be undefined.
11799 if (!isUndefUpperHalf(Mask))
11800 return false;
11801
11802 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11803 SDValue Base;
11804
11805 // Attempt to match first source from mask before insertion point.
11806 if (isUndefInRange(Mask, 0, Idx)) {
11807 /* EMPTY */
11808 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11809 Base = V1;
11810 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11811 Base = V2;
11812 } else {
11813 continue;
11814 }
11815
11816 // Extend the extraction length looking to match both the insertion of
11817 // the second source and the remaining elements of the first.
11818 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11819 SDValue Insert;
11820 int Len = Hi - Idx;
11821
11822 // Match insertion.
11823 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11824 Insert = V1;
11825 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11826 Insert = V2;
11827 } else {
11828 continue;
11829 }
11830
11831 // Match the remaining elements of the lower half.
11832 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11833 /* EMPTY */
11834 } else if ((!Base || (Base == V1)) &&
11835 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11836 Base = V1;
11837 } else if ((!Base || (Base == V2)) &&
11838 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11839 Size + Hi)) {
11840 Base = V2;
11841 } else {
11842 continue;
11843 }
11844
11845 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11846 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11847 V1 = Base;
11848 V2 = Insert;
11849 return true;
11850 }
11851 }
11852
11853 return false;
11854}
11855
11856/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11857static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11858 SDValue V2, ArrayRef<int> Mask,
11859 const APInt &Zeroable, SelectionDAG &DAG) {
11860 uint64_t BitLen, BitIdx;
11861 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11862 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11863 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11864 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11865
11866 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11867 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11868 V2 ? V2 : DAG.getUNDEF(VT),
11869 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11870 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11871
11872 return SDValue();
11873}
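// ---- Editorial example (not part of the LLVM source) ----------------------
// A scalar model of the EXTRQI field extraction matched above, to show what
// BitLen/BitIdx mean: the instruction pulls BitLen bits starting at BitIdx out
// of the low 64 bits of the source, places them at bit 0 and zero-fills the
// rest of the lower half (the upper half is undefined). Sketch only; the
// widths are assumed to be valid.
#include <cstdint>

static inline uint64_t extrqModel(uint64_t SrcLo64, unsigned BitIdx,
                                  unsigned BitLen) {
  uint64_t FieldMask =
      BitLen >= 64 ? ~0ULL : ((1ULL << BitLen) - 1); // keep BitLen bits
  return (SrcLo64 >> BitIdx) & FieldMask;            // field lands at bit 0
}
// ----------------------------------------------------------------------------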
11874
11875/// Lower a vector shuffle as a zero or any extension.
11876///
11877/// Given a specific number of elements, element bit width, and extension
11878/// stride, produce either a zero or any extension based on the available
11879/// features of the subtarget. The extended elements are consecutive and
11880/// can begin at an offset element index in the input; to avoid excess
11881/// shuffling, the offset must either be in the bottom lane
11882/// or at the start of a higher lane. All extended elements must be from
11883/// the same lane.
11884static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11885 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11886 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11887 assert(Scale > 1 && "Need a scale to extend.");
11888 int EltBits = VT.getScalarSizeInBits();
11889 int NumElements = VT.getVectorNumElements();
11890 int NumEltsPerLane = 128 / EltBits;
11891 int OffsetLane = Offset / NumEltsPerLane;
11892 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11893 "Only 8, 16, and 32 bit elements can be extended.");
11894 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11895 assert(0 <= Offset && "Extension offset must be positive.");
11896 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11897 "Extension offset must be in the first lane or start an upper lane.");
11898
11899 // Check that an index is in same lane as the base offset.
11900 auto SafeOffset = [&](int Idx) {
11901 return OffsetLane == (Idx / NumEltsPerLane);
11902 };
11903
11904 // Shift along an input so that the offset base moves to the first element.
11905 auto ShuffleOffset = [&](SDValue V) {
11906 if (!Offset)
11907 return V;
11908
11909 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11910 for (int i = 0; i * Scale < NumElements; ++i) {
11911 int SrcIdx = i + Offset;
11912 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11913 }
11914 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11915 };
11916
11917 // Found a valid a/zext mask! Try various lowering strategies based on the
11918 // input type and available ISA extensions.
11919 if (Subtarget.hasSSE41()) {
11920 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11921 // PUNPCK will catch this in a later shuffle match.
11922 if (Offset && Scale == 2 && VT.is128BitVector())
11923 return SDValue();
11924 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11925 NumElements / Scale);
11926 InputV = DAG.getBitcast(VT, InputV);
11927 InputV = ShuffleOffset(InputV);
11928 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11929 DL, ExtVT, InputV, DAG);
11930 return DAG.getBitcast(VT, InputV);
11931 }
11932
11933 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11934 InputV = DAG.getBitcast(VT, InputV);
11935
11936 // For any extends we can cheat for larger element sizes and use shuffle
11937 // instructions that can fold with a load and/or copy.
11938 if (AnyExt && EltBits == 32) {
11939 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11940 -1};
11941 return DAG.getBitcast(
11942 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11943 DAG.getBitcast(MVT::v4i32, InputV),
11944 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11945 }
11946 if (AnyExt && EltBits == 16 && Scale > 2) {
11947 int PSHUFDMask[4] = {Offset / 2, -1,
11948 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11949 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11950 DAG.getBitcast(MVT::v4i32, InputV),
11951 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11952 int PSHUFWMask[4] = {1, -1, -1, -1};
11953 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11954 return DAG.getBitcast(
11955 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11956 DAG.getBitcast(MVT::v8i16, InputV),
11957 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11958 }
11959
11960 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11961 // to 64-bits.
11962 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11963 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11964 assert(VT.is128BitVector() && "Unexpected vector width!");
11965
11966 int LoIdx = Offset * EltBits;
11967 SDValue Lo = DAG.getBitcast(
11968 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11969 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11970 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11971
11972 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11973 return DAG.getBitcast(VT, Lo);
11974
11975 int HiIdx = (Offset + 1) * EltBits;
11976 SDValue Hi = DAG.getBitcast(
11977 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11978 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11979 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11980 return DAG.getBitcast(VT,
11981 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11982 }
11983
11984 // If this would require more than 2 unpack instructions to expand, use
11985 // pshufb when available. We can only use more than 2 unpack instructions
11986 // when zero extending i8 elements which also makes it easier to use pshufb.
11987 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11988 assert(NumElements == 16 && "Unexpected byte vector width!");
11989 SDValue PSHUFBMask[16];
11990 for (int i = 0; i < 16; ++i) {
11991 int Idx = Offset + (i / Scale);
11992 if ((i % Scale == 0 && SafeOffset(Idx))) {
11993 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11994 continue;
11995 }
11996 PSHUFBMask[i] =
11997 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11998 }
11999 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12000 return DAG.getBitcast(
12001 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12002 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12003 }
12004
12005 // If we are extending from an offset, ensure we start on a boundary that
12006 // we can unpack from.
12007 int AlignToUnpack = Offset % (NumElements / Scale);
12008 if (AlignToUnpack) {
12009 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12010 for (int i = AlignToUnpack; i < NumElements; ++i)
12011 ShMask[i - AlignToUnpack] = i;
12012 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12013 Offset -= AlignToUnpack;
12014 }
12015
12016 // Otherwise emit a sequence of unpacks.
12017 do {
12018 unsigned UnpackLoHi = X86ISD::UNPCKL;
12019 if (Offset >= (NumElements / 2)) {
12020 UnpackLoHi = X86ISD::UNPCKH;
12021 Offset -= (NumElements / 2);
12022 }
12023
12024 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12025 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12026 : getZeroVector(InputVT, Subtarget, DAG, DL);
12027 InputV = DAG.getBitcast(InputVT, InputV);
12028 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12029 Scale /= 2;
12030 EltBits *= 2;
12031 NumElements /= 2;
12032 } while (Scale > 1);
12033 return DAG.getBitcast(VT, InputV);
12034}
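// ---- Editorial example (not part of the LLVM source) ----------------------
// The pre-SSE4.1 "sequence of unpacks" fallback above boils down to
// interleaving with zero: unpacking a byte vector against an all-zero vector
// doubles the element width and zero-extends. A minimal sketch for the low
// half (i8 -> i16):
#include <emmintrin.h>

static inline __m128i zextLo8xU8ToU16(__m128i V) {
  // [b0..b15] -> [b0, 0, b1, 0, ..., b7, 0], i.e. eight zero-extended u16.
  return _mm_unpacklo_epi8(V, _mm_setzero_si128());
}
// ----------------------------------------------------------------------------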
12035
12036/// Try to lower a vector shuffle as a zero extension on any microarch.
12037///
12038/// This routine will try to do everything in its power to cleverly lower
12039/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12040/// check for the profitability of this lowering, it tries to aggressively
12041/// match this pattern. It will use all of the micro-architectural details it
12042/// can to emit an efficient lowering. It handles both blends with all-zero
12043/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12044/// masking out later).
12045///
12046/// The reason we have dedicated lowering for zext-style shuffles is that they
12047/// are both incredibly common and often quite performance sensitive.
12048static SDValue lowerShuffleAsZeroOrAnyExtend(
12049 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12050 const APInt &Zeroable, const X86Subtarget &Subtarget,
12051 SelectionDAG &DAG) {
12052 int Bits = VT.getSizeInBits();
12053 int NumLanes = Bits / 128;
12054 int NumElements = VT.getVectorNumElements();
12055 int NumEltsPerLane = NumElements / NumLanes;
12056 assert(VT.getScalarSizeInBits() <= 32 &&
12057 "Exceeds 32-bit integer zero extension limit");
12058 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12059
12060 // Define a helper function to check a particular ext-scale and lower to it if
12061 // valid.
12062 auto Lower = [&](int Scale) -> SDValue {
12063 SDValue InputV;
12064 bool AnyExt = true;
12065 int Offset = 0;
12066 int Matches = 0;
12067 for (int i = 0; i < NumElements; ++i) {
12068 int M = Mask[i];
12069 if (M < 0)
12070 continue; // Valid anywhere but doesn't tell us anything.
12071 if (i % Scale != 0) {
12072 // Each of the extended elements needs to be zeroable.
12073 if (!Zeroable[i])
12074 return SDValue();
12075
12076 // We no longer are in the anyext case.
12077 AnyExt = false;
12078 continue;
12079 }
12080
12081 // Each of the base elements needs to be consecutive indices into the
12082 // same input vector.
12083 SDValue V = M < NumElements ? V1 : V2;
12084 M = M % NumElements;
12085 if (!InputV) {
12086 InputV = V;
12087 Offset = M - (i / Scale);
12088 } else if (InputV != V)
12089 return SDValue(); // Flip-flopping inputs.
12090
12091 // Offset must start in the lowest 128-bit lane or at the start of an
12092 // upper lane.
12093 // FIXME: Is it ever worth allowing a negative base offset?
12094 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12095 (Offset % NumEltsPerLane) == 0))
12096 return SDValue();
12097
12098 // If we are offsetting, all referenced entries must come from the same
12099 // lane.
12100 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12101 return SDValue();
12102
12103 if ((M % NumElements) != (Offset + (i / Scale)))
12104 return SDValue(); // Non-consecutive strided elements.
12105 Matches++;
12106 }
12107
12108 // If we fail to find an input, we have a zero-shuffle which should always
12109 // have already been handled.
12110 // FIXME: Maybe handle this here in case during blending we end up with one?
12111 if (!InputV)
12112 return SDValue();
12113
12114 // If we are offsetting, don't extend if we only match a single input, we
12115 // can always do better by using a basic PSHUF or PUNPCK.
12116 if (Offset != 0 && Matches < 2)
12117 return SDValue();
12118
12119 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12120 InputV, Mask, Subtarget, DAG);
12121 };
12122
12123 // The widest scale possible for extending is to a 64-bit integer.
12124 assert(Bits % 64 == 0 &&
12125 "The number of bits in a vector must be divisible by 64 on x86!");
12126 int NumExtElements = Bits / 64;
12127
12128 // Each iteration, try extending the elements half as much, but into twice as
12129 // many elements.
12130 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12131 assert(NumElements % NumExtElements == 0 &&
12132 "The input vector size must be divisible by the extended size.");
12133 if (SDValue V = Lower(NumElements / NumExtElements))
12134 return V;
12135 }
12136
12137 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12138 if (Bits != 128)
12139 return SDValue();
12140
12141 // Returns one of the source operands if the shuffle can be reduced to a
12142 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12143 auto CanZExtLowHalf = [&]() {
12144 for (int i = NumElements / 2; i != NumElements; ++i)
12145 if (!Zeroable[i])
12146 return SDValue();
12147 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12148 return V1;
12149 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12150 return V2;
12151 return SDValue();
12152 };
12153
12154 if (SDValue V = CanZExtLowHalf()) {
12155 V = DAG.getBitcast(MVT::v2i64, V);
12156 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12157 return DAG.getBitcast(VT, V);
12158 }
12159
12160 // No viable ext lowering found.
12161 return SDValue();
12162}
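// ---- Editorial example (not part of the LLVM source) ----------------------
// The CanZExtLowHalf tail case above corresponds to MOVQ: copy the low 64 bits
// of the source and zero the upper 64 bits. At the intrinsic level this is
// _mm_move_epi64. Sketch only.
#include <emmintrin.h>

static inline __m128i zextLowHalf(__m128i V) {
  return _mm_move_epi64(V); // [v0, v1] (as i64) -> [v0, 0]
}
// ----------------------------------------------------------------------------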
12163
12164/// Try to get a scalar value for a specific element of a vector.
12165///
12166/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12167static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12168 SelectionDAG &DAG) {
12169 MVT VT = V.getSimpleValueType();
12170 MVT EltVT = VT.getVectorElementType();
12171 V = peekThroughBitcasts(V);
12172
12173 // If the bitcasts shift the element size, we can't extract an equivalent
12174 // element from it.
12175 MVT NewVT = V.getSimpleValueType();
12176 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12177 return SDValue();
12178
12179 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12180 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12181 // Ensure the scalar operand is the same size as the destination.
12182 // FIXME: Add support for scalar truncation where possible.
12183 SDValue S = V.getOperand(Idx);
12184 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12185 return DAG.getBitcast(EltVT, S);
12186 }
12187
12188 return SDValue();
12189}
12190
12191/// Helper to test for a load that can be folded with x86 shuffles.
12192///
12193/// This is particularly important because the set of instructions varies
12194/// significantly based on whether the operand is a load or not.
12195static bool isShuffleFoldableLoad(SDValue V) {
12196 return V->hasOneUse() &&
12197 ISD::isNON_EXTLoad(V.getNode());
12198}
12199
12200template<typename T>
12201static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12202 T EltVT = VT.getScalarType();
12203 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12204}
12205
12206/// Try to lower insertion of a single element into a zero vector.
12207///
12208/// This is a common pattern for which we have especially efficient lowering
12209/// patterns across all subtarget feature sets.
12210static SDValue lowerShuffleAsElementInsertion(
12211 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12212 const APInt &Zeroable, const X86Subtarget &Subtarget,
12213 SelectionDAG &DAG) {
12214 MVT ExtVT = VT;
12215 MVT EltVT = VT.getVectorElementType();
12216 unsigned NumElts = VT.getVectorNumElements();
12217 unsigned EltBits = VT.getScalarSizeInBits();
12218
12219 if (isSoftF16(EltVT, Subtarget))
12220 return SDValue();
12221
12222 int V2Index =
12223 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12224 Mask.begin();
12225 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12226 bool IsV1Zeroable = true;
12227 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12228 if (i != V2Index && !Zeroable[i]) {
12229 IsV1Zeroable = false;
12230 break;
12231 }
12232
12233 // Bail if a non-zero V1 isn't used in place.
12234 if (!IsV1Zeroable) {
12235 SmallVector<int, 8> V1Mask(Mask);
12236 V1Mask[V2Index] = -1;
12237 if (!isNoopShuffleMask(V1Mask))
12238 return SDValue();
12239 }
12240
12241 // Check for a single input from a SCALAR_TO_VECTOR node.
12242 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12243 // all the smarts here sunk into that routine. However, the current
12244 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12245 // vector shuffle lowering is dead.
12246 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12247 DAG);
12248 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12249 // We need to zext the scalar if it is smaller than an i32.
12250 V2S = DAG.getBitcast(EltVT, V2S);
12251 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12252 // Using zext to expand a narrow element won't work for non-zero
12253 // insertions. But we can use a masked constant vector if we're
12254 // inserting V2 into the bottom of V1.
12255 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12256 return SDValue();
12257
12258 // Zero-extend directly to i32.
12259 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12260 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12261
12262 // If we're inserting into a constant, mask off the inserted index
12263 // and OR with the zero-extended scalar.
12264 if (!IsV1Zeroable) {
12265 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12266 Bits[V2Index] = APInt::getZero(EltBits);
12267 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12268 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12269 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12270 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12271 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12272 }
12273 }
12274 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12275 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12276 EltVT == MVT::i16) {
12277 // Either not inserting from the low element of the input or the input
12278 // element size is too small to use VZEXT_MOVL to clear the high bits.
12279 return SDValue();
12280 }
12281
12282 if (!IsV1Zeroable) {
12283 // If V1 can't be treated as a zero vector we have fewer options to lower
12284 // this. We can't support integer vectors or non-zero targets cheaply.
12285 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12286 if (!VT.isFloatingPoint() || V2Index != 0)
12287 return SDValue();
12288 if (!VT.is128BitVector())
12289 return SDValue();
12290
12291 // Otherwise, use MOVSD, MOVSS or MOVSH.
12292 unsigned MovOpc = 0;
12293 if (EltVT == MVT::f16)
12294 MovOpc = X86ISD::MOVSH;
12295 else if (EltVT == MVT::f32)
12296 MovOpc = X86ISD::MOVSS;
12297 else if (EltVT == MVT::f64)
12298 MovOpc = X86ISD::MOVSD;
12299 else
12300 llvm_unreachable("Unsupported floating point element type to handle!");
12301 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12302 }
12303
12304 // This lowering only works for the low element with floating point vectors.
12305 if (VT.isFloatingPoint() && V2Index != 0)
12306 return SDValue();
12307
12308 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12309 if (ExtVT != VT)
12310 V2 = DAG.getBitcast(VT, V2);
12311
12312 if (V2Index != 0) {
12313 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12314 // the desired position. Otherwise it is more efficient to do a vector
12315 // shift left. We know that we can do a vector shift left because all
12316 // the inputs are zero.
12317 if (VT.isFloatingPoint() || NumElts <= 4) {
12318 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12319 V2Shuffle[V2Index] = 0;
12320 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12321 } else {
12322 V2 = DAG.getBitcast(MVT::v16i8, V2);
12323 V2 = DAG.getNode(
12324 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12325 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12326 V2 = DAG.getBitcast(VT, V2);
12327 }
12328 }
12329 return V2;
12330}
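// ---- Editorial example (not part of the LLVM source) ----------------------
// The VZEXT_MOVL pattern used above, seen from user code: inserting a scalar
// into an otherwise-zero vector is a single MOVD/MOVQ/MOVSS-style instruction
// rather than a build_vector plus blend. A minimal sketch for a 32-bit integer
// element landing in lane 0:
#include <emmintrin.h>

static inline __m128i insertIntoZeroVector(int X) {
  return _mm_cvtsi32_si128(X); // [X, 0, 0, 0]
}
// ----------------------------------------------------------------------------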
12331
12332/// Try to lower broadcast of a single - truncated - integer element,
12333/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12334///
12335/// This assumes we have AVX2.
12336static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12337 int BroadcastIdx,
12338 const X86Subtarget &Subtarget,
12339 SelectionDAG &DAG) {
12340 assert(Subtarget.hasAVX2() &&
12341 "We can only lower integer broadcasts with AVX2!");
12342
12343 MVT EltVT = VT.getVectorElementType();
12344 MVT V0VT = V0.getSimpleValueType();
12345
12346 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12347 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12348
12349 MVT V0EltVT = V0VT.getVectorElementType();
12350 if (!V0EltVT.isInteger())
12351 return SDValue();
12352
12353 const unsigned EltSize = EltVT.getSizeInBits();
12354 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12355
12356 // This is only a truncation if the original element type is larger.
12357 if (V0EltSize <= EltSize)
12358 return SDValue();
12359
12360 assert(((V0EltSize % EltSize) == 0) &&
12361 "Scalar type sizes must all be powers of 2 on x86!");
12362
12363 const unsigned V0Opc = V0.getOpcode();
12364 const unsigned Scale = V0EltSize / EltSize;
12365 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12366
12367 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12368 V0Opc != ISD::BUILD_VECTOR)
12369 return SDValue();
12370
12371 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12372
12373 // If we're extracting non-least-significant bits, shift so we can truncate.
12374 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12375 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12376 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12377 if (const int OffsetIdx = BroadcastIdx % Scale)
12378 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12379 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12380
12381 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12382 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12383}
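// ---- Editorial example (not part of the LLVM source) ----------------------
// What the truncating broadcast above amounts to, sketched in user terms:
// splatting a 32-bit slice of a wider scalar is a shift, a truncate and a
// single broadcast, which is cheaper than shuffling the whole vector. The
// choice of the upper 32-bit slice here is a hypothetical example; the AVX2
// set1 intrinsic typically lowers to VPBROADCASTD.
#include <immintrin.h>
#include <cstdint>

static inline __m256i broadcastUpper32(uint64_t Scalar) {
  return _mm256_set1_epi32((int)(Scalar >> 32)); // srl + trunc + broadcast
}
// ----------------------------------------------------------------------------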
12384
12385/// Test whether this can be lowered with a single SHUFPS instruction.
12386///
12387/// This is used to disable more specialized lowerings when the shufps lowering
12388/// will happen to be efficient.
12389static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12390 // This routine only handles 128-bit shufps.
12391 assert(Mask.size() == 4 && "Unsupported mask size!");
12392 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12393 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12394 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12395 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12396
12397 // To lower with a single SHUFPS we need to have the low half and high half
12398 // each requiring a single input.
12399 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12400 return false;
12401 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12402 return false;
12403
12404 return true;
12405}
12406
12407/// Test whether the specified input (0 or 1) is in-place blended by the
12408/// given mask.
12409///
12410/// This returns true if the elements from a particular input are already in the
12411/// slot required by the given mask and require no permutation.
12412static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12413 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12414 int Size = Mask.size();
12415 for (int i = 0; i < Size; ++i)
12416 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12417 return false;
12418
12419 return true;
12420}
12421
12422/// If we are extracting two 128-bit halves of a vector and shuffling the
12423/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12424/// multi-shuffle lowering.
12425static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12426 SDValue N1, ArrayRef<int> Mask,
12427 SelectionDAG &DAG) {
12428 MVT VT = N0.getSimpleValueType();
12429 assert((VT.is128BitVector() &&
12430 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12431 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12432
12433 // Check that both sources are extracts of the same source vector.
12434 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12435 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12436 N0.getOperand(0) != N1.getOperand(0) ||
12437 !N0.hasOneUse() || !N1.hasOneUse())
12438 return SDValue();
12439
12440 SDValue WideVec = N0.getOperand(0);
12441 MVT WideVT = WideVec.getSimpleValueType();
12442 if (!WideVT.is256BitVector())
12443 return SDValue();
12444
12445 // Match extracts of each half of the wide source vector. Commute the shuffle
12446 // if the extract of the low half is N1.
12447 unsigned NumElts = VT.getVectorNumElements();
12448 SmallVector<int, 4> NewMask(Mask);
12449 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12450 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12451 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12452 ShuffleVectorSDNode::commuteMask(NewMask);
12453 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12454 return SDValue();
12455
12456 // Final bailout: if the mask is simple, we are better off using an extract
12457 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12458 // because that avoids a constant load from memory.
12459 if (NumElts == 4 &&
12460 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12461 return SDValue();
12462
12463 // Extend the shuffle mask with undef elements.
12464 NewMask.append(NumElts, -1);
12465
12466 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12467 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12468 NewMask);
12469 // This is free: ymm -> xmm.
12470 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12471 DAG.getIntPtrConstant(0, DL));
12472}
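// ---- Editorial example (not part of the LLVM source) ----------------------
// The rewrite above in user-level terms: shuffling elements drawn from the two
// 128-bit halves of a 256-bit vector can be one cross-lane VPERMPS on the wide
// vector followed by a free ymm->xmm extract, instead of two extracts plus a
// narrow shuffle. Sketch only; the index set {1, 5, 2, 6} is a made-up example
// that is not one of the "simple" masks the bailout keeps on the narrow path.
#include <immintrin.h>

static inline __m128 pickAcrossHalves(__m256 X) {
  __m256i Idx = _mm256_setr_epi32(1, 5, 2, 6, 0, 0, 0, 0);
  __m256 Shuf = _mm256_permutevar8x32_ps(X, Idx); // vpermps
  return _mm256_castps256_ps128(Shuf);            // take the low 128 bits
}
// ----------------------------------------------------------------------------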
12473
12474/// Try to lower broadcast of a single element.
12475///
12476/// For convenience, this code also bundles all of the subtarget feature set
12477/// filtering. While a little annoying to re-dispatch on type here, there isn't
12478/// a convenient way to factor it out.
12479static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12480 SDValue V2, ArrayRef<int> Mask,
12481 const X86Subtarget &Subtarget,
12482 SelectionDAG &DAG) {
12483 MVT EltVT = VT.getVectorElementType();
12484 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12485 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12486 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12487 return SDValue();
12488
12489 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12490 // we can only broadcast from a register with AVX2.
12491 unsigned NumEltBits = VT.getScalarSizeInBits();
12492 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12493 ? X86ISD::MOVDDUP
12494 : X86ISD::VBROADCAST;
12495 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12496
12497 // Check that the mask is a broadcast.
12498 int BroadcastIdx = getSplatIndex(Mask);
12499 if (BroadcastIdx < 0)
12500 return SDValue();
12501 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12502 "a sorted mask where the broadcast "
12503 "comes from V1.");
12504
12505 // Go up the chain of (vector) values to find a scalar load that we can
12506 // combine with the broadcast.
12507 // TODO: Combine this logic with findEltLoadSrc() used by
12508 // EltsFromConsecutiveLoads().
12509 int BitOffset = BroadcastIdx * NumEltBits;
12510 SDValue V = V1;
12511 for (;;) {
12512 switch (V.getOpcode()) {
12513 case ISD::BITCAST: {
12514 V = V.getOperand(0);
12515 continue;
12516 }
12517 case ISD::CONCAT_VECTORS: {
12518 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12519 int OpIdx = BitOffset / OpBitWidth;
12520 V = V.getOperand(OpIdx);
12521 BitOffset %= OpBitWidth;
12522 continue;
12523 }
12524 case ISD::EXTRACT_SUBVECTOR: {
12525 // The extraction index adds to the existing offset.
12526 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12527 unsigned Idx = V.getConstantOperandVal(1);
12528 unsigned BeginOffset = Idx * EltBitWidth;
12529 BitOffset += BeginOffset;
12530 V = V.getOperand(0);
12531 continue;
12532 }
12533 case ISD::INSERT_SUBVECTOR: {
12534 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12535 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12536 int Idx = (int)V.getConstantOperandVal(2);
12537 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12538 int BeginOffset = Idx * EltBitWidth;
12539 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12540 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12541 BitOffset -= BeginOffset;
12542 V = VInner;
12543 } else {
12544 V = VOuter;
12545 }
12546 continue;
12547 }
12548 }
12549 break;
12550 }
12551 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12552 BroadcastIdx = BitOffset / NumEltBits;
12553
12554 // Do we need to bitcast the source to retrieve the original broadcast index?
12555 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12556
12557 // Check if this is a broadcast of a scalar. We special case lowering
12558 // for scalars so that we can more effectively fold with loads.
12559 // If the original value has a larger element type than the shuffle, the
12560 // broadcast element is in essence truncated. Make that explicit to ease
12561 // folding.
12562 if (BitCastSrc && VT.isInteger())
12563 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12564 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12565 return TruncBroadcast;
12566
12567 // Also check the simpler case, where we can directly reuse the scalar.
12568 if (!BitCastSrc &&
12569 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12570 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12571 V = V.getOperand(BroadcastIdx);
12572
12573 // If we can't broadcast from a register, check that the input is a load.
12574 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12575 return SDValue();
12576 } else if (ISD::isNormalLoad(V.getNode()) &&
12577 cast<LoadSDNode>(V)->isSimple()) {
12578 // We do not check for one-use of the vector load because a broadcast load
12579 // is expected to be a win for code size, register pressure, and possibly
12580 // uops even if the original vector load is not eliminated.
12581
12582 // Reduce the vector load and shuffle to a broadcasted scalar load.
12583 LoadSDNode *Ld = cast<LoadSDNode>(V);
12584 SDValue BaseAddr = Ld->getOperand(1);
12585 MVT SVT = VT.getScalarType();
12586 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12587 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12588 SDValue NewAddr =
12589 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12590
12591 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12592 // than MOVDDUP.
12593 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12594 if (Opcode == X86ISD::VBROADCAST) {
12595 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12596 SDValue Ops[] = {Ld->getChain(), NewAddr};
12597 V = DAG.getMemIntrinsicNode(
12598 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12599 DAG.getMachineFunction().getMachineMemOperand(
12600 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12601 DAG.makeEquivalentMemoryOrdering(Ld, V);
12602 return DAG.getBitcast(VT, V);
12603 }
12604 assert(SVT == MVT::f64 && "Unexpected VT!");
12605 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12606 DAG.getMachineFunction().getMachineMemOperand(
12607 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12608 DAG.makeEquivalentMemoryOrdering(Ld, V);
12609 } else if (!BroadcastFromReg) {
12610 // We can't broadcast from a vector register.
12611 return SDValue();
12612 } else if (BitOffset != 0) {
12613 // We can only broadcast from the zero-element of a vector register,
12614 // but it can be advantageous to broadcast from the zero-element of a
12615 // subvector.
12616 if (!VT.is256BitVector() && !VT.is512BitVector())
12617 return SDValue();
12618
12619 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12620 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12621 return SDValue();
12622
12623 // Only broadcast the zero-element of a 128-bit subvector.
12624 if ((BitOffset % 128) != 0)
12625 return SDValue();
12626
12627 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12628 "Unexpected bit-offset");
12629 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12630 "Unexpected vector size");
12631 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12632 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12633 }
12634
12635 // On AVX we can use VBROADCAST directly for scalar sources.
12636 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12637 V = DAG.getBitcast(MVT::f64, V);
12638 if (Subtarget.hasAVX()) {
12639 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12640 return DAG.getBitcast(VT, V);
12641 }
12642 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12643 }
12644
12645 // If this is a scalar, do the broadcast on this type and bitcast.
12646 if (!V.getValueType().isVector()) {
12647 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12648 "Unexpected scalar size");
12649 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12650 VT.getVectorNumElements());
12651 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12652 }
12653
12654 // We only support broadcasting from 128-bit vectors to minimize the
12655 // number of patterns we need to deal with in isel. So extract down to
12656 // 128-bits, removing as many bitcasts as possible.
12657 if (V.getValueSizeInBits() > 128)
12658 V = extract128BitVector(V, 0, DAG, DL);
12659
12660 // Otherwise cast V to a vector with the same element type as VT, but
12661 // possibly narrower than VT. Then perform the broadcast.
12662 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12663 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12664 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12665}
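// ---- Editorial example (not part of the LLVM source) ----------------------
// The "broadcast from a load" case above, as it looks from user code: a splat
// whose source is a scalar in memory becomes a single vbroadcastss load rather
// than a load plus shuffle. Sketch only.
#include <immintrin.h>

static inline __m256 splatFromMemory(const float *P) {
  return _mm256_broadcast_ss(P); // one broadcast load, no extra shuffle
}
// ----------------------------------------------------------------------------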
12666
12667// Check for whether we can use INSERTPS to perform the shuffle. We only use
12668// INSERTPS when the V1 elements are already in the correct locations
12669// because otherwise we can just always use two SHUFPS instructions which
12670// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12671// perform INSERTPS if a single V1 element is out of place and all V2
12672// elements are zeroable.
12673static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12674 unsigned &InsertPSMask,
12675 const APInt &Zeroable,
12676 ArrayRef<int> Mask, SelectionDAG &DAG) {
12677 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12678 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12679 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12680
12681 // Attempt to match INSERTPS with one element from VA or VB being
12682 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12683 // are updated.
12684 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12685 ArrayRef<int> CandidateMask) {
12686 unsigned ZMask = 0;
12687 int VADstIndex = -1;
12688 int VBDstIndex = -1;
12689 bool VAUsedInPlace = false;
12690
12691 for (int i = 0; i < 4; ++i) {
12692 // Synthesize a zero mask from the zeroable elements (includes undefs).
12693 if (Zeroable[i]) {
12694 ZMask |= 1 << i;
12695 continue;
12696 }
12697
12698 // Flag if we use any VA inputs in place.
12699 if (i == CandidateMask[i]) {
12700 VAUsedInPlace = true;
12701 continue;
12702 }
12703
12704 // We can only insert a single non-zeroable element.
12705 if (VADstIndex >= 0 || VBDstIndex >= 0)
12706 return false;
12707
12708 if (CandidateMask[i] < 4) {
12709 // VA input out of place for insertion.
12710 VADstIndex = i;
12711 } else {
12712 // VB input for insertion.
12713 VBDstIndex = i;
12714 }
12715 }
12716
12717 // Don't bother if we have no (non-zeroable) element for insertion.
12718 if (VADstIndex < 0 && VBDstIndex < 0)
12719 return false;
12720
12721 // Determine element insertion src/dst indices. The src index is from the
12722 // start of the inserted vector, not the start of the concatenated vector.
12723 unsigned VBSrcIndex = 0;
12724 if (VADstIndex >= 0) {
12725 // If we have a VA input out of place, we use VA as the V2 element
12726 // insertion and don't use the original V2 at all.
12727 VBSrcIndex = CandidateMask[VADstIndex];
12728 VBDstIndex = VADstIndex;
12729 VB = VA;
12730 } else {
12731 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12732 }
12733
12734 // If no V1 inputs are used in place, then the result is created only from
12735 // the zero mask and the V2 insertion - so remove V1 dependency.
12736 if (!VAUsedInPlace)
12737 VA = DAG.getUNDEF(MVT::v4f32);
12738
12739 // Update V1, V2 and InsertPSMask accordingly.
12740 V1 = VA;
12741 V2 = VB;
12742
12743 // Insert the V2 element into the desired position.
12744 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12745 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12746 return true;
12747 };
12748
12749 if (matchAsInsertPS(V1, V2, Mask))
12750 return true;
12751
12752 // Commute and try again.
12753 SmallVector<int, 4> CommutedMask(Mask);
12754 ShuffleVectorSDNode::commuteMask(CommutedMask);
12755 if (matchAsInsertPS(V2, V1, CommutedMask))
12756 return true;
12757
12758 return false;
12759}
12760
12761static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12762 ArrayRef<int> Mask, const APInt &Zeroable,
12763 SelectionDAG &DAG) {
12764 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12765 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12766
12767 // Attempt to match the insertps pattern.
12768 unsigned InsertPSMask = 0;
12769 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12770 return SDValue();
12771
12772 // Insert the V2 element into the desired position.
12773 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12774 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12775}
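// ---- Editorial example (not part of the LLVM source) ----------------------
// How the 8-bit INSERTPS immediate built above is laid out: bits [7:6] select
// the source element of the second operand, bits [5:4] the destination slot,
// and bits [3:0] zero out destination elements. A sketch with the SSE4.1
// intrinsic, inserting element 2 of B into slot 1 of A and zeroing slot 3;
// the particular indices are a made-up example.
#include <smmintrin.h>

static inline __m128 insertPSExample(__m128 A, __m128 B) {
  // imm = (SrcIdx << 6) | (DstIdx << 4) | ZMask
  return _mm_insert_ps(A, B, (2 << 6) | (1 << 4) | 0x8);
}
// ----------------------------------------------------------------------------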
12776
12777/// Handle lowering of 2-lane 64-bit floating point shuffles.
12778///
12779/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12780/// support for floating point shuffles but not integer shuffles. These
12781/// instructions will incur a domain crossing penalty on some chips though so
12782/// it is better to avoid lowering through this for integer vectors where
12783/// possible.
12784static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12785 const APInt &Zeroable, SDValue V1, SDValue V2,
12786 const X86Subtarget &Subtarget,
12787 SelectionDAG &DAG) {
12788 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12789 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12790 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12791
12792 if (V2.isUndef()) {
12793 // Check for being able to broadcast a single element.
12794 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12795 Mask, Subtarget, DAG))
12796 return Broadcast;
12797
12798 // Straight shuffle of a single input vector. Simulate this by using the
12799 // single input as both of the "inputs" to this instruction.
12800 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12801
12802 if (Subtarget.hasAVX()) {
12803 // If we have AVX, we can use VPERMILPS which will allow folding a load
12804 // into the shuffle.
12805 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12806 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12807 }
12808
12809 return DAG.getNode(
12810 X86ISD::SHUFP, DL, MVT::v2f64,
12811 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12812 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12813 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12814 }
12815 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12816 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12817 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12818 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12819
12820 if (Subtarget.hasAVX2())
12821 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12822 return Extract;
12823
12824 // When loading a scalar and then shuffling it into a vector we can often do
12825 // the insertion cheaply.
12826 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12827 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12828 return Insertion;
12829 // Try inverting the insertion since for v2 masks it is easy to do and we
12830 // can't reliably sort the mask one way or the other.
12831 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12832 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12833 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12834 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12835 return Insertion;
12836
12837 // Try to use one of the special instruction patterns to handle two common
12838 // blend patterns if a zero-blend above didn't work.
12839 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12840 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12841 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12842 // We can either use a special instruction to load over the low double or
12843 // to move just the low double.
12844 return DAG.getNode(
12845 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12846 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12847
12848 if (Subtarget.hasSSE41())
12849 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12850 Zeroable, Subtarget, DAG))
12851 return Blend;
12852
12853 // Use dedicated unpack instructions for masks that match their pattern.
12854 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12855 return V;
12856
12857 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12858 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12859 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12860}
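// ---- Editorial example (not part of the LLVM source) ----------------------
// The SHUFPD immediate computed above, from the intrinsic side: bit 0 picks
// which element of the first operand goes to lane 0 and bit 1 picks which
// element of the second operand goes to lane 1. Sketch only.
#include <emmintrin.h>

static inline __m128d shufpdExample(__m128d A, __m128d B) {
  // Shuffle mask {1, 3}: lane 0 = A[1], lane 1 = B[1] -> imm = 0b11.
  return _mm_shuffle_pd(A, B, 0x3);
}
// ----------------------------------------------------------------------------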
12861
12862/// Handle lowering of 2-lane 64-bit integer shuffles.
12863///
12864/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12865/// the integer unit to minimize domain crossing penalties. However, for blends
12866/// it falls back to the floating point shuffle operation with appropriate bit
12867/// casting.
12868static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12869 const APInt &Zeroable, SDValue V1, SDValue V2,
12870 const X86Subtarget &Subtarget,
12871 SelectionDAG &DAG) {
12872 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12873 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12874 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12875
12876 if (V2.isUndef()) {
12877 // Check for being able to broadcast a single element.
12878 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12879 Mask, Subtarget, DAG))
12880 return Broadcast;
12881
12882 // Straight shuffle of a single input vector. For everything from SSE2
12883 // onward this has a single fast instruction with no scary immediates.
12884 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12885 V1 = DAG.getBitcast(MVT::v4i32, V1);
12886 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12887 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12888 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12889 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12890 return DAG.getBitcast(
12891 MVT::v2i64,
12892 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12893 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12894 }
12895 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12896 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12897 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12898 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12899
12900 if (Subtarget.hasAVX2())
12901 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12902 return Extract;
12903
12904 // Try to use shift instructions.
12905 if (SDValue Shift =
12906 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12907 DAG, /*BitwiseOnly*/ false))
12908 return Shift;
12909
12910 // When loading a scalar and then shuffling it into a vector we can often do
12911 // the insertion cheaply.
12912 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12913 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12914 return Insertion;
12915 // Try inverting the insertion since for v2 masks it is easy to do and we
12916 // can't reliably sort the mask one way or the other.
12917 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12918 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12919 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12920 return Insertion;
12921
12922 // We have different paths for blend lowering, but they all must use the
12923 // *exact* same predicate.
12924 bool IsBlendSupported = Subtarget.hasSSE41();
12925 if (IsBlendSupported)
12926 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12927 Zeroable, Subtarget, DAG))
12928 return Blend;
12929
12930 // Use dedicated unpack instructions for masks that match their pattern.
12931 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12932 return V;
12933
12934 // Try to use byte rotation instructions.
12935 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12936 if (Subtarget.hasSSSE3()) {
12937 if (Subtarget.hasVLX())
12938 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12939 Zeroable, Subtarget, DAG))
12940 return Rotate;
12941
12942 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12943 Subtarget, DAG))
12944 return Rotate;
12945 }
12946
12947 // If we have direct support for blends, we should lower by decomposing into
12948 // a permute. That will be faster than the domain cross.
12949 if (IsBlendSupported)
12950 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12951 Subtarget, DAG);
12952
12953 // We implement this with SHUFPD which is pretty lame because it will likely
12954 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12955 // However, all the alternatives are still more cycles and newer chips don't
12956 // have this problem. It would be really nice if x86 had better shuffles here.
12957 V1 = DAG.getBitcast(MVT::v2f64, V1);
12958 V2 = DAG.getBitcast(MVT::v2f64, V2);
12959 return DAG.getBitcast(MVT::v2i64,
12960 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12961}
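// ---- Editorial example (not part of the LLVM source) ----------------------
// The single-input v2i64 path above widens the mask to v4i32 and uses PSHUFD,
// which stays in the integer domain. Swapping the two 64-bit halves (mask
// {1, 0}) becomes the v4i32 shuffle {2, 3, 0, 1}. Sketch only.
#include <emmintrin.h>

static inline __m128i swapI64Halves(__m128i V) {
  return _mm_shuffle_epi32(V, _MM_SHUFFLE(1, 0, 3, 2)); // pshufd {2,3,0,1}
}
// ----------------------------------------------------------------------------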
12962
12963/// Lower a vector shuffle using the SHUFPS instruction.
12964///
12965/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12966/// It makes no assumptions about whether this is the *best* lowering; it
12967/// simply uses it.
12968static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12969 ArrayRef<int> Mask, SDValue V1,
12970 SDValue V2, SelectionDAG &DAG) {
12971 SDValue LowV = V1, HighV = V2;
12972 SmallVector<int, 4> NewMask(Mask);
12973 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12974
12975 if (NumV2Elements == 1) {
12976 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12977
12978 // Compute the index adjacent to V2Index and in the same half by toggling
12979 // the low bit.
12980 int V2AdjIndex = V2Index ^ 1;
12981
12982 if (Mask[V2AdjIndex] < 0) {
12983 // Handles all the cases where we have a single V2 element and an undef.
12984 // This will only ever happen in the high lanes because we commute the
12985 // vector otherwise.
12986 if (V2Index < 2)
12987 std::swap(LowV, HighV);
12988 NewMask[V2Index] -= 4;
12989 } else {
12990 // Handle the case where the V2 element ends up adjacent to a V1 element.
12991 // To make this work, blend them together as the first step.
12992 int V1Index = V2AdjIndex;
12993 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12994 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12995 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12996
12997 // Now proceed to reconstruct the final blend as we have the necessary
12998 // high or low half formed.
12999 if (V2Index < 2) {
13000 LowV = V2;
13001 HighV = V1;
13002 } else {
13003 HighV = V2;
13004 }
13005 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13006 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13007 }
13008 } else if (NumV2Elements == 2) {
13009 if (Mask[0] < 4 && Mask[1] < 4) {
13010 // Handle the easy case where we have V1 in the low lanes and V2 in the
13011 // high lanes.
13012 NewMask[2] -= 4;
13013 NewMask[3] -= 4;
13014 } else if (Mask[2] < 4 && Mask[3] < 4) {
13015 // We also handle the reversed case because this utility may get called
13016 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13017 // arrange things in the right direction.
13018 NewMask[0] -= 4;
13019 NewMask[1] -= 4;
13020 HighV = V1;
13021 LowV = V2;
13022 } else {
13023 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13024 // trying to place elements directly, just blend them and set up the final
13025 // shuffle to place them.
13026
13027 // The first two blend mask elements are for V1, the second two are for
13028 // V2.
13029 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13030 Mask[2] < 4 ? Mask[2] : Mask[3],
13031 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13032 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13033 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13034 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13035
13036 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13037 // a blend.
13038 LowV = HighV = V1;
13039 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13040 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13041 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13042 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13043 }
13044 } else if (NumV2Elements == 3) {
13045 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13046 // we can get here via other paths (e.g. repeated mask matching) that we
13047 // don't want to send through another round of lowerVECTOR_SHUFFLE.
13048 ShuffleVectorSDNode::commuteShuffleMask(NewMask, 4);
13049 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13050 }
13051 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13052 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13053}
13054
13055/// Lower 4-lane 32-bit floating point shuffles.
13056///
13057/// Uses instructions exclusively from the floating point unit to minimize
13058/// domain crossing penalties, as these are sufficient to implement all v4f32
13059/// shuffles.
13060 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13061 const APInt &Zeroable, SDValue V1, SDValue V2,
13062 const X86Subtarget &Subtarget,
13063 SelectionDAG &DAG) {
13064 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13065 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13066 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13067
13068 if (Subtarget.hasSSE41())
13069 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13070 Zeroable, Subtarget, DAG))
13071 return Blend;
13072
13073 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13074
13075 if (NumV2Elements == 0) {
13076 // Check for being able to broadcast a single element.
13077 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13078 Mask, Subtarget, DAG))
13079 return Broadcast;
13080
13081 // Use even/odd duplicate instructions for masks that match their pattern.
13082 if (Subtarget.hasSSE3()) {
13083 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13084 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13085 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13086 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13087 }
13088
13089 if (Subtarget.hasAVX()) {
13090 // If we have AVX, we can use VPERMILPS which will allow folding a load
13091 // into the shuffle.
13092 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13093 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13094 }
13095
13096 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13097 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13098 if (!Subtarget.hasSSE2()) {
13099 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13100 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13101 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13102 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13103 }
13104
13105 // Otherwise, use a straight shuffle of a single input vector. We pass the
13106 // input vector to both operands to simulate this with a SHUFPS.
13107 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13108 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13109 }
13110
13111 if (Subtarget.hasSSE2())
13112 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13113 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13114 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13115 return ZExt;
13116 }
13117
13118 if (Subtarget.hasAVX2())
13119 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13120 return Extract;
13121
13122 // There are special ways we can lower some single-element blends. However, we
13123 // also have custom lowerings for more complex single-element blends below,
13124 // which we fall back to if both this and BLENDPS fail to match. So restrict
13125 // this to the case where the V2 input targets element 0 of the mask -- that
13126 // is the fast case here.
13127 if (NumV2Elements == 1 && Mask[0] >= 4)
13128 if (SDValue V = lowerShuffleAsElementInsertion(
13129 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13130 return V;
13131
13132 if (Subtarget.hasSSE41()) {
13133 // Use INSERTPS if we can complete the shuffle efficiently.
13134 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13135 return V;
13136
13137 if (!isSingleSHUFPSMask(Mask))
13138 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13139 V2, Mask, DAG))
13140 return BlendPerm;
13141 }
13142
13143 // Use low/high mov instructions. These are only valid in SSE1 because
13144 // otherwise they are widened to v2f64 and never get here.
13145 if (!Subtarget.hasSSE2()) {
13146 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13147 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13148 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13149 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13150 }
13151
13152 // Use dedicated unpack instructions for masks that match their pattern.
13153 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13154 return V;
13155
13156 // Otherwise fall back to a SHUFPS lowering strategy.
13157 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13158}
13159
13160/// Lower 4-lane i32 vector shuffles.
13161///
13162/// We try to handle these with integer-domain shuffles where we can, but for
13163/// blends we use the floating point domain blend instructions.
13165 const APInt &Zeroable, SDValue V1, SDValue V2,
13166 const X86Subtarget &Subtarget,
13167 SelectionDAG &DAG) {
13168 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13169 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13170 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13171
13172 // Whenever we can lower this as a zext, that instruction is strictly faster
13173 // than any alternative. It also allows us to fold memory operands into the
13174 // shuffle in many cases.
13175 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13176 Zeroable, Subtarget, DAG))
13177 return ZExt;
13178
13179 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13180
13181 // Try to use shift instructions if fast.
13182 if (Subtarget.preferLowerShuffleAsShift()) {
13183 if (SDValue Shift =
13184 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13185 Subtarget, DAG, /*BitwiseOnly*/ true))
13186 return Shift;
13187 if (NumV2Elements == 0)
13188 if (SDValue Rotate =
13189 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13190 return Rotate;
13191 }
13192
13193 if (NumV2Elements == 0) {
13194 // Try to use broadcast unless the mask only has one non-undef element.
13195 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13196 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13197 Mask, Subtarget, DAG))
13198 return Broadcast;
13199 }
13200
13201 // Straight shuffle of a single input vector. For everything from SSE2
13202 // onward this has a single fast instruction with no scary immediates.
13203 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13204 // but we aren't actually going to use the UNPCK instruction because doing
13205 // so prevents folding a load into this instruction or making a copy.
13206 const int UnpackLoMask[] = {0, 0, 1, 1};
13207 const int UnpackHiMask[] = {2, 2, 3, 3};
13208 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13209 Mask = UnpackLoMask;
13210 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13211 Mask = UnpackHiMask;
13212
13213 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13214 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13215 }
13216
13217 if (Subtarget.hasAVX2())
13218 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13219 return Extract;
13220
13221 // Try to use shift instructions.
13222 if (SDValue Shift =
13223 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13224 DAG, /*BitwiseOnly*/ false))
13225 return Shift;
13226
13227 // There are special ways we can lower some single-element blends.
13228 if (NumV2Elements == 1)
13229 if (SDValue V = lowerShuffleAsElementInsertion(
13230 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13231 return V;
13232
13233 // We have different paths for blend lowering, but they all must use the
13234 // *exact* same predicate.
13235 bool IsBlendSupported = Subtarget.hasSSE41();
13236 if (IsBlendSupported)
13237 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13238 Zeroable, Subtarget, DAG))
13239 return Blend;
13240
13241 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13242 Zeroable, Subtarget, DAG))
13243 return Masked;
13244
13245 // Use dedicated unpack instructions for masks that match their pattern.
13246 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13247 return V;
13248
13249 // Try to use byte rotation instructions.
13250 // It's more profitable for pre-SSSE3 targets to use shuffles/unpacks.
13251 if (Subtarget.hasSSSE3()) {
13252 if (Subtarget.hasVLX())
13253 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13254 Zeroable, Subtarget, DAG))
13255 return Rotate;
13256
13257 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13258 Subtarget, DAG))
13259 return Rotate;
13260 }
13261
13262 // Assume that a single SHUFPS is faster than an alternative sequence of
13263 // multiple instructions (even if the CPU has a domain penalty).
13264 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13265 if (!isSingleSHUFPSMask(Mask)) {
13266 // If we have direct support for blends, we should lower by decomposing into
13267 // a permute. That will be faster than the domain cross.
13268 if (IsBlendSupported)
13269 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13270 Subtarget, DAG);
13271
13272 // Try to lower by permuting the inputs into an unpack instruction.
13273 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13274 Mask, Subtarget, DAG))
13275 return Unpack;
13276 }
13277
13278 // We implement this with SHUFPS because it can blend from two vectors.
13279 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13280 // up the inputs, bypassing domain shift penalties that we would incur if we
13281 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13282 // relevant.
13283 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13284 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13285 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13286 return DAG.getBitcast(MVT::v4i32, ShufPS);
13287}
13288
13289/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13290/// shuffle lowering, and the most complex part.
13291///
13292/// The lowering strategy is to try to form pairs of input lanes which are
13293/// targeted at the same half of the final vector, and then use a dword shuffle
13294/// to place them onto the right half, and finally unpack the paired lanes into
13295/// their final position.
13296///
13297/// The exact breakdown of how to form these dword pairs and align them on the
13298/// correct sides is really tricky. See the comments within the function for
13299/// more of the details.
13300///
13301/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13302/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13303/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13304/// vector, form the analogous 128-bit 8-element Mask.
13305 static SDValue lowerV8I16GeneralSingleInputShuffle(
13306 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13307 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13308 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13309 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13310
13311 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13312 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13313 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13314
13315 // Attempt to directly match PSHUFLW or PSHUFHW.
13316 if (isUndefOrInRange(LoMask, 0, 4) &&
13317 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13318 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13319 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13320 }
13321 if (isUndefOrInRange(HiMask, 4, 8) &&
13322 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13323 for (int i = 0; i != 4; ++i)
13324 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13325 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13326 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13327 }
13328
13329 SmallVector<int, 4> LoInputs;
13330 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13331 array_pod_sort(LoInputs.begin(), LoInputs.end());
13332 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13333 SmallVector<int, 4> HiInputs;
13334 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13335 array_pod_sort(HiInputs.begin(), HiInputs.end());
13336 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13337 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13338 int NumHToL = LoInputs.size() - NumLToL;
13339 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13340 int NumHToH = HiInputs.size() - NumLToH;
13341 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13342 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13343 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13344 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13345
13346 // If we are shuffling values from one half - check how many different DWORD
13347 // pairs we need to create. If only 1 or 2 then we can perform this as a
13348 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
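// For example, the single-input mask [1,0,1,0,3,2,3,2] uses only two dword
// pairs, (1,0) and (3,2): PSHUFLW with [1,0,3,2] forms both pairs in the low
// half, and PSHUFD with [0,0,1,1] then replicates them into place.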
13349 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13350 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13351 V = DAG.getNode(ShufWOp, DL, VT, V,
13352 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13353 V = DAG.getBitcast(PSHUFDVT, V);
13354 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13355 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13356 return DAG.getBitcast(VT, V);
13357 };
13358
13359 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13360 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13361 SmallVector<std::pair<int, int>, 4> DWordPairs;
13362 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13363
13364 // Collect the different DWORD pairs.
13365 for (int DWord = 0; DWord != 4; ++DWord) {
13366 int M0 = Mask[2 * DWord + 0];
13367 int M1 = Mask[2 * DWord + 1];
13368 M0 = (M0 >= 0 ? M0 % 4 : M0);
13369 M1 = (M1 >= 0 ? M1 % 4 : M1);
13370 if (M0 < 0 && M1 < 0)
13371 continue;
13372
13373 bool Match = false;
13374 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13375 auto &DWordPair = DWordPairs[j];
13376 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13377 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13378 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13379 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13380 PSHUFDMask[DWord] = DOffset + j;
13381 Match = true;
13382 break;
13383 }
13384 }
13385 if (!Match) {
13386 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13387 DWordPairs.push_back(std::make_pair(M0, M1));
13388 }
13389 }
13390
13391 if (DWordPairs.size() <= 2) {
13392 DWordPairs.resize(2, std::make_pair(-1, -1));
13393 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13394 DWordPairs[1].first, DWordPairs[1].second};
13395 if ((NumHToL + NumHToH) == 0)
13396 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13397 if ((NumLToL + NumLToH) == 0)
13398 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13399 }
13400 }
13401
13402 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13403 // such inputs we can swap two of the dwords across the half mark and end up
13404 // with <=2 inputs to each half in each half. Once there, we can fall through
13405 // to the generic code below. For example:
13406 //
13407 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13408 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13409 //
13410 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13411 // and an existing 2-into-2 on the other half. In this case we may have to
13412 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13413 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13414 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13415 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13416 // half than the one we target for fixing) will be fixed when we re-enter this
13417 // path. We will also combine away any sequence of PSHUFD instructions that
13418 // result into a single instruction. Here is an example of the tricky case:
13419 //
13420 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13421 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13422 //
13423 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13424 //
13425 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13426 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13427 //
13428 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13429 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13430 //
13431 // The result is fine to be handled by the generic logic.
13432 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13433 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13434 int AOffset, int BOffset) {
13435 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13436 "Must call this with A having 3 or 1 inputs from the A half.");
13437 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13438 "Must call this with B having 1 or 3 inputs from the B half.");
13439 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13440 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13441
13442 bool ThreeAInputs = AToAInputs.size() == 3;
13443
13444 // Compute the index of the dword with only one word among the three inputs
13445 // in a half by taking the sum of all four word indices in that half and
13446 // subtracting the sum of the actual three inputs. The difference is the
13447 // remaining slot.
13448 int ADWord = 0, BDWord = 0;
13449 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13450 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13451 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13452 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13453 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13454 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13455 int TripleNonInputIdx =
13456 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13457 TripleDWord = TripleNonInputIdx / 2;
13458
13459 // We use xor with one to compute the adjacent DWord to whichever one the
13460 // OneInput is in.
13461 OneInputDWord = (OneInput / 2) ^ 1;
13462
13463 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13464 // and BToA inputs. If there is also such a problem with the BToB and AToB
13465 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13466 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13467 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13468 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13469 // Compute how many inputs will be flipped by swapping these DWords. We
13470 // need to balance this to ensure we don't form a 3-1 shuffle in the other
13471 // half.
13473 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13474 llvm::count(AToBInputs, 2 * ADWord + 1);
13475 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13476 llvm::count(BToBInputs, 2 * BDWord + 1);
13477 if ((NumFlippedAToBInputs == 1 &&
13478 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13479 (NumFlippedBToBInputs == 1 &&
13480 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13481 // We choose whether to fix the A half or B half based on whether that
13482 // half has zero flipped inputs. At zero, we may not be able to fix it
13483 // with that half. We also bias towards fixing the B half because that
13484 // will more commonly be the high half, and we have to bias one way.
13485 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13486 ArrayRef<int> Inputs) {
13487 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13488 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13489 // Determine whether the free index is in the flipped dword or the
13490 // unflipped dword based on where the pinned index is. We use this bit
13491 // in an xor to conditionally select the adjacent dword.
13492 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13493 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13494 if (IsFixIdxInput == IsFixFreeIdxInput)
13495 FixFreeIdx += 1;
13496 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13497 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13498 "We need to be changing the number of flipped inputs!");
13499 int PSHUFHalfMask[] = {0, 1, 2, 3};
13500 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13501 V = DAG.getNode(
13502 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13503 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13504 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13505
13506 for (int &M : Mask)
13507 if (M >= 0 && M == FixIdx)
13508 M = FixFreeIdx;
13509 else if (M >= 0 && M == FixFreeIdx)
13510 M = FixIdx;
13511 };
13512 if (NumFlippedBToBInputs != 0) {
13513 int BPinnedIdx =
13514 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13515 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13516 } else {
13517 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13518 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13519 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13520 }
13521 }
13522 }
13523
13524 int PSHUFDMask[] = {0, 1, 2, 3};
13525 PSHUFDMask[ADWord] = BDWord;
13526 PSHUFDMask[BDWord] = ADWord;
13527 V = DAG.getBitcast(
13528 VT,
13529 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13530 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13531
13532 // Adjust the mask to match the new locations of A and B.
13533 for (int &M : Mask)
13534 if (M >= 0 && M/2 == ADWord)
13535 M = 2 * BDWord + M % 2;
13536 else if (M >= 0 && M/2 == BDWord)
13537 M = 2 * ADWord + M % 2;
13538
13539 // Recurse back into this routine to re-compute state now that this isn't
13540 // a 3 and 1 problem.
13541 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13542 };
13543 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13544 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13545 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13546 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13547
13548 // At this point there are at most two inputs to the low and high halves from
13549 // each half. That means the inputs can always be grouped into dwords and
13550 // those dwords can then be moved to the correct half with a dword shuffle.
13551 // We use at most one low and one high word shuffle to collect these paired
13552 // inputs into dwords, and finally a dword shuffle to place them.
13553 int PSHUFLMask[4] = {-1, -1, -1, -1};
13554 int PSHUFHMask[4] = {-1, -1, -1, -1};
13555 int PSHUFDMask[4] = {-1, -1, -1, -1};
13556
13557 // First fix the masks for all the inputs that are staying in their
13558 // original halves. This will then dictate the targets of the cross-half
13559 // shuffles.
13560 auto fixInPlaceInputs =
13561 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13562 MutableArrayRef<int> SourceHalfMask,
13563 MutableArrayRef<int> HalfMask, int HalfOffset) {
13564 if (InPlaceInputs.empty())
13565 return;
13566 if (InPlaceInputs.size() == 1) {
13567 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13568 InPlaceInputs[0] - HalfOffset;
13569 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13570 return;
13571 }
13572 if (IncomingInputs.empty()) {
13573 // Just fix all of the in place inputs.
13574 for (int Input : InPlaceInputs) {
13575 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13576 PSHUFDMask[Input / 2] = Input / 2;
13577 }
13578 return;
13579 }
13580
13581 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13582 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13583 InPlaceInputs[0] - HalfOffset;
13584 // Put the second input next to the first so that they are packed into
13585 // a dword. We find the adjacent index by toggling the low bit.
13586 int AdjIndex = InPlaceInputs[0] ^ 1;
13587 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13588 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13589 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13590 };
13591 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13592 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13593
13594 // Now gather the cross-half inputs and place them into a free dword of
13595 // their target half.
13596 // FIXME: This operation could almost certainly be simplified dramatically to
13597 // look more like the 3-1 fixing operation.
13598 auto moveInputsToRightHalf = [&PSHUFDMask](
13599 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13600 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13601 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13602 int DestOffset) {
13603 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13604 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13605 };
13606 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13607 int Word) {
13608 int LowWord = Word & ~1;
13609 int HighWord = Word | 1;
13610 return isWordClobbered(SourceHalfMask, LowWord) ||
13611 isWordClobbered(SourceHalfMask, HighWord);
13612 };
13613
13614 if (IncomingInputs.empty())
13615 return;
13616
13617 if (ExistingInputs.empty()) {
13618 // Map any dwords with inputs from them into the right half.
13619 for (int Input : IncomingInputs) {
13620 // If the source half mask maps over the inputs, turn those into
13621 // swaps and use the swapped lane.
13622 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13623 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13624 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13625 Input - SourceOffset;
13626 // We have to swap the uses in our half mask in one sweep.
13627 for (int &M : HalfMask)
13628 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13629 M = Input;
13630 else if (M == Input)
13631 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13632 } else {
13633 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13634 Input - SourceOffset &&
13635 "Previous placement doesn't match!");
13636 }
13637 // Note that this correctly re-maps both when we do a swap and when
13638 // we observe the other side of the swap above. We rely on that to
13639 // avoid swapping the members of the input list directly.
13640 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13641 }
13642
13643 // Map the input's dword into the correct half.
13644 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13645 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13646 else
13647 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13648 Input / 2 &&
13649 "Previous placement doesn't match!");
13650 }
13651
13652 // And just directly shift any other-half mask elements to be same-half
13653 // as we will have mirrored the dword containing the element into the
13654 // same position within that half.
13655 for (int &M : HalfMask)
13656 if (M >= SourceOffset && M < SourceOffset + 4) {
13657 M = M - SourceOffset + DestOffset;
13658 assert(M >= 0 && "This should never wrap below zero!");
13659 }
13660 return;
13661 }
13662
13663 // Ensure we have the input in a viable dword of its current half. This
13664 // is particularly tricky because the original position may be clobbered
13665 // by inputs being moved and *staying* in that half.
13666 if (IncomingInputs.size() == 1) {
13667 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13668 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13669 SourceOffset;
13670 SourceHalfMask[InputFixed - SourceOffset] =
13671 IncomingInputs[0] - SourceOffset;
13672 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13673 InputFixed);
13674 IncomingInputs[0] = InputFixed;
13675 }
13676 } else if (IncomingInputs.size() == 2) {
13677 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13678 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13679 // We have two non-adjacent or clobbered inputs we need to extract from
13680 // the source half. To do this, we need to map them into some adjacent
13681 // dword slot in the source mask.
13682 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13683 IncomingInputs[1] - SourceOffset};
13684
13685 // If there is a free slot in the source half mask adjacent to one of
13686 // the inputs, place the other input in it. We use (Index XOR 1) to
13687 // compute an adjacent index.
13688 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13689 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13690 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13691 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13692 InputsFixed[1] = InputsFixed[0] ^ 1;
13693 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13694 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13695 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13696 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13697 InputsFixed[0] = InputsFixed[1] ^ 1;
13698 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13699 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13700 // The two inputs are in the same DWord but it is clobbered and the
13701 // adjacent DWord isn't used at all. Move both inputs to the free
13702 // slot.
13703 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13704 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13705 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13706 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13707 } else {
13708 // The only way we hit this point is if there is no clobbering
13709 // (because there are no off-half inputs to this half) and there is no
13710 // free slot adjacent to one of the inputs. In this case, we have to
13711 // swap an input with a non-input.
13712 for (int i = 0; i < 4; ++i)
13713 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13714 "We can't handle any clobbers here!");
13715 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13716 "Cannot have adjacent inputs here!");
13717
13718 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13719 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13720
13721 // We also have to update the final source mask in this case because
13722 // it may need to undo the above swap.
13723 for (int &M : FinalSourceHalfMask)
13724 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13725 M = InputsFixed[1] + SourceOffset;
13726 else if (M == InputsFixed[1] + SourceOffset)
13727 M = (InputsFixed[0] ^ 1) + SourceOffset;
13728
13729 InputsFixed[1] = InputsFixed[0] ^ 1;
13730 }
13731
13732 // Point everything at the fixed inputs.
13733 for (int &M : HalfMask)
13734 if (M == IncomingInputs[0])
13735 M = InputsFixed[0] + SourceOffset;
13736 else if (M == IncomingInputs[1])
13737 M = InputsFixed[1] + SourceOffset;
13738
13739 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13740 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13741 }
13742 } else {
13743 llvm_unreachable("Unhandled input size!");
13744 }
13745
13746 // Now hoist the DWord down to the right half.
13747 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13748 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13749 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13750 for (int &M : HalfMask)
13751 for (int Input : IncomingInputs)
13752 if (M == Input)
13753 M = FreeDWord * 2 + Input % 2;
13754 };
13755 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13756 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13757 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13758 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13759
13760 // Now enact all the shuffles we've computed to move the inputs into their
13761 // target half.
13762 if (!isNoopShuffleMask(PSHUFLMask))
13763 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13764 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13765 if (!isNoopShuffleMask(PSHUFHMask))
13766 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13767 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13768 if (!isNoopShuffleMask(PSHUFDMask))
13769 V = DAG.getBitcast(
13770 VT,
13771 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13772 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13773
13774 // At this point, each half should contain all its inputs, and we can then
13775 // just shuffle them into their final position.
13776 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13777 "Failed to lift all the high half inputs to the low mask!");
13778 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13779 "Failed to lift all the low half inputs to the high mask!");
13780
13781 // Do a half shuffle for the low mask.
13782 if (!isNoopShuffleMask(LoMask))
13783 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13784 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13785
13786 // Do a half shuffle with the high mask after shifting its values down.
13787 for (int &M : HiMask)
13788 if (M >= 0)
13789 M -= 4;
13790 if (!isNoopShuffleMask(HiMask))
13791 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13792 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13793
13794 return V;
13795}
13796
13797/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13798/// blend if only one input is used.
13799 static SDValue lowerShuffleAsBlendOfPSHUFBs(
13800 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13801 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13803 "Lane crossing shuffle masks not supported");
13804
13805 int NumBytes = VT.getSizeInBits() / 8;
13806 int Size = Mask.size();
13807 int Scale = NumBytes / Size;
13808
13809 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13810 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13811 V1InUse = false;
13812 V2InUse = false;
13813
13814 for (int i = 0; i < NumBytes; ++i) {
13815 int M = Mask[i / Scale];
13816 if (M < 0)
13817 continue;
13818
13819 const int ZeroMask = 0x80;
13820 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13821 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13822 if (Zeroable[i / Scale])
13823 V1Idx = V2Idx = ZeroMask;
13824
13825 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13826 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13827 V1InUse |= (ZeroMask != V1Idx);
13828 V2InUse |= (ZeroMask != V2Idx);
13829 }
13830
13831 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13832 if (V1InUse)
13833 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13834 DAG.getBuildVector(ShufVT, DL, V1Mask));
13835 if (V2InUse)
13836 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13837 DAG.getBuildVector(ShufVT, DL, V2Mask));
13838
13839 // If we need shuffled inputs from both, blend the two.
13840 SDValue V;
13841 if (V1InUse && V2InUse)
13842 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13843 else
13844 V = V1InUse ? V1 : V2;
13845
13846 // Cast the result back to the correct type.
13847 return DAG.getBitcast(VT, V);
13848}
13849
13850/// Generic lowering of 8-lane i16 shuffles.
13851///
13852/// This handles both single-input shuffles and combined shuffle/blends with
13853/// two inputs. The single input shuffles are immediately delegated to
13854/// a dedicated lowering routine.
13855///
13856/// The blends are lowered in one of three fundamental ways. If there are few
13857/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13858/// of the input is significantly cheaper when lowered as an interleaving of
13859/// the two inputs, try to interleave them. Otherwise, blend the low and high
13860/// halves of the inputs separately (making them have relatively few inputs)
13861/// and then concatenate them.
13862 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13863 const APInt &Zeroable, SDValue V1, SDValue V2,
13864 const X86Subtarget &Subtarget,
13865 SelectionDAG &DAG) {
13866 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13867 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13868 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13869
13870 // Whenever we can lower this as a zext, that instruction is strictly faster
13871 // than any alternative.
13872 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13873 Zeroable, Subtarget, DAG))
13874 return ZExt;
13875
13876 // Try to lower using a truncation.
13877 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13878 Subtarget, DAG))
13879 return V;
13880
13881 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13882
13883 if (NumV2Inputs == 0) {
13884 // Try to use shift instructions.
13885 if (SDValue Shift =
13886 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13887 Subtarget, DAG, /*BitwiseOnly*/ false))
13888 return Shift;
13889
13890 // Check for being able to broadcast a single element.
13891 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13892 Mask, Subtarget, DAG))
13893 return Broadcast;
13894
13895 // Try to use bit rotation instructions.
13896 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13897 Subtarget, DAG))
13898 return Rotate;
13899
13900 // Use dedicated unpack instructions for masks that match their pattern.
13901 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13902 return V;
13903
13904 // Use dedicated pack instructions for masks that match their pattern.
13905 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13906 Subtarget))
13907 return V;
13908
13909 // Try to use byte rotation instructions.
13910 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13911 Subtarget, DAG))
13912 return Rotate;
13913
13914 // Make a copy of the mask so it can be modified.
13915 SmallVector<int, 8> MutableMask(Mask);
13916 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13917 Subtarget, DAG);
13918 }
13919
13920 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13921 "All single-input shuffles should be canonicalized to be V1-input "
13922 "shuffles.");
13923
13924 // Try to use shift instructions.
13925 if (SDValue Shift =
13926 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13927 DAG, /*BitwiseOnly*/ false))
13928 return Shift;
13929
13930 // See if we can use SSE4A Extraction / Insertion.
13931 if (Subtarget.hasSSE4A())
13932 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13933 Zeroable, DAG))
13934 return V;
13935
13936 // There are special ways we can lower some single-element blends.
13937 if (NumV2Inputs == 1)
13938 if (SDValue V = lowerShuffleAsElementInsertion(
13939 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13940 return V;
13941
13942 // We have different paths for blend lowering, but they all must use the
13943 // *exact* same predicate.
13944 bool IsBlendSupported = Subtarget.hasSSE41();
13945 if (IsBlendSupported)
13946 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13947 Zeroable, Subtarget, DAG))
13948 return Blend;
13949
13950 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13951 Zeroable, Subtarget, DAG))
13952 return Masked;
13953
13954 // Use dedicated unpack instructions for masks that match their pattern.
13955 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13956 return V;
13957
13958 // Use dedicated pack instructions for masks that match their pattern.
13959 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13960 Subtarget))
13961 return V;
13962
13963 // Try to lower using a truncation.
13964 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13965 Subtarget, DAG))
13966 return V;
13967
13968 // Try to use byte rotation instructions.
13969 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13970 Subtarget, DAG))
13971 return Rotate;
13972
13973 if (SDValue BitBlend =
13974 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13975 return BitBlend;
13976
13977 // Try to use byte shift instructions to mask.
13978 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13979 Zeroable, Subtarget, DAG))
13980 return V;
13981
13982 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
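// For example, the two-input mask [0,2,4,6,8,10,12,14] has NumEvenDrops == 1:
// on SSE4.1, clearing the high word of every dword in both inputs and then
// packing them with PACKUSDW yields exactly the even elements of V1 followed
// by those of V2.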
13983 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
13984 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13985 !Subtarget.hasVLX()) {
13986 // Check if this is part of a 256-bit vector truncation.
13987 unsigned PackOpc = 0;
13988 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13989 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13990 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13991 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13992 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13993 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13994 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13995 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13996 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13997 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13998 PackOpc = X86ISD::PACKUS;
13999 } else if (Subtarget.hasSSE41()) {
14000 SmallVector<SDValue, 4> DWordClearOps(4,
14001 DAG.getConstant(0, DL, MVT::i32));
14002 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14003 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14004 SDValue DWordClearMask =
14005 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14006 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14007 DWordClearMask);
14008 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14009 DWordClearMask);
14010 PackOpc = X86ISD::PACKUS;
14011 } else if (!Subtarget.hasSSSE3()) {
14012 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14013 V1 = DAG.getBitcast(MVT::v4i32, V1);
14014 V2 = DAG.getBitcast(MVT::v4i32, V2);
14015 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14016 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14017 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14018 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14019 PackOpc = X86ISD::PACKSS;
14020 }
14021 if (PackOpc) {
14022 // Now pack things back together.
14023 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14024 if (NumEvenDrops == 2) {
14025 Result = DAG.getBitcast(MVT::v4i32, Result);
14026 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14027 }
14028 return Result;
14029 }
14030 }
14031
14032 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
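// For example, for the mask [1,3,5,7,9,11,13,15] an arithmetic dword shift
// right by 16 leaves each odd word sign-extended, and PACKSSDW then recovers
// exactly those 16-bit values (the signed saturation is a no-op).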
14033 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14034 if (NumOddDrops == 1) {
14035 bool HasSSE41 = Subtarget.hasSSE41();
14036 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14037 DAG.getBitcast(MVT::v4i32, V1),
14038 DAG.getTargetConstant(16, DL, MVT::i8));
14039 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14040 DAG.getBitcast(MVT::v4i32, V2),
14041 DAG.getTargetConstant(16, DL, MVT::i8));
14042 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14043 MVT::v8i16, V1, V2);
14044 }
14045
14046 // Try to lower by permuting the inputs into an unpack instruction.
14047 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14048 Mask, Subtarget, DAG))
14049 return Unpack;
14050
14051 // If we can't directly blend but can use PSHUFB, that will be better as it
14052 // can both shuffle and set up the inefficient blend.
14053 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14054 bool V1InUse, V2InUse;
14055 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14056 Zeroable, DAG, V1InUse, V2InUse);
14057 }
14058
14059 // We can always bit-blend if we have to so the fallback strategy is to
14060 // decompose into single-input permutes and blends/unpacks.
14061 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14062 Mask, Subtarget, DAG);
14063}
14064
14065/// Lower 8-lane 16-bit floating point shuffles.
14066 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14067 const APInt &Zeroable, SDValue V1, SDValue V2,
14068 const X86Subtarget &Subtarget,
14069 SelectionDAG &DAG) {
14070 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14071 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14072 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14073 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14074
14075 if (Subtarget.hasFP16()) {
14076 if (NumV2Elements == 0) {
14077 // Check for being able to broadcast a single element.
14078 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14079 Mask, Subtarget, DAG))
14080 return Broadcast;
14081 }
14082 if (NumV2Elements == 1 && Mask[0] >= 8)
14083 if (SDValue V = lowerShuffleAsElementInsertion(
14084 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14085 return V;
14086 }
14087
14088 V1 = DAG.getBitcast(MVT::v8i16, V1);
14089 V2 = DAG.getBitcast(MVT::v8i16, V2);
14090 return DAG.getBitcast(MVT::v8f16,
14091 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14092}
14093
14094 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14095 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14096 // the active subvector is extracted.
14097 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14098 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14099 const X86Subtarget &Subtarget,
14100 SelectionDAG &DAG) {
14101 MVT MaskVT = VT.changeTypeToInteger();
14102 SDValue MaskNode;
14103 MVT ShuffleVT = VT;
14104 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14105 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14106 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14107 ShuffleVT = V1.getSimpleValueType();
14108
14109 // Adjust mask to correct indices for the second input.
14110 int NumElts = VT.getVectorNumElements();
14111 unsigned Scale = 512 / VT.getSizeInBits();
14112 SmallVector<int, 32> AdjustedMask(Mask);
14113 for (int &M : AdjustedMask)
14114 if (NumElts <= M)
14115 M += (Scale - 1) * NumElts;
14116 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14117 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14118 } else {
14119 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14120 }
14121
14122 SDValue Result;
14123 if (V2.isUndef())
14124 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14125 else
14126 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14127
14128 if (VT != ShuffleVT)
14129 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14130
14131 return Result;
14132}
14133
14134/// Generic lowering of v16i8 shuffles.
14135///
14136/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14137 /// detect any complexity-reducing interleaving. If that doesn't help, it uses
14138/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14139/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14140/// back together.
14141 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14142 const APInt &Zeroable, SDValue V1, SDValue V2,
14143 const X86Subtarget &Subtarget,
14144 SelectionDAG &DAG) {
14145 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14146 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14147 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14148
14149 // Try to use shift instructions.
14150 if (SDValue Shift =
14151 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14152 DAG, /*BitwiseOnly*/ false))
14153 return Shift;
14154
14155 // Try to use byte rotation instructions.
14156 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14157 Subtarget, DAG))
14158 return Rotate;
14159
14160 // Use dedicated pack instructions for masks that match their pattern.
14161 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14162 Subtarget))
14163 return V;
14164
14165 // Try to use a zext lowering.
14166 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14167 Zeroable, Subtarget, DAG))
14168 return ZExt;
14169
14171 // Try to lower using a truncation.
14171 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14172 Subtarget, DAG))
14173 return V;
14174
14175 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14176 Subtarget, DAG))
14177 return V;
14178
14179 // See if we can use SSE4A Extraction / Insertion.
14180 if (Subtarget.hasSSE4A())
14181 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14182 Zeroable, DAG))
14183 return V;
14184
14185 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14186
14187 // For single-input shuffles, there are some nicer lowering tricks we can use.
14188 if (NumV2Elements == 0) {
14189 // Check for being able to broadcast a single element.
14190 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14191 Mask, Subtarget, DAG))
14192 return Broadcast;
14193
14194 // Try to use bit rotation instructions.
14195 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14196 Subtarget, DAG))
14197 return Rotate;
14198
14199 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14200 return V;
14201
14202 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14203 // Notably, this handles splat and partial-splat shuffles more efficiently.
14204 // However, it only makes sense if the pre-duplication shuffle simplifies
14205 // things significantly. Currently, this means we need to be able to
14206 // express the pre-duplication shuffle as an i16 shuffle.
14207 //
14208 // FIXME: We should check for other patterns which can be widened into an
14209 // i16 shuffle as well.
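// For example (ignoring the earlier broadcast path), a v16i8 splat of byte 5
// qualifies: unpacking V1 with itself leaves word 5 holding (byte5, byte5),
// and a post-duplication i16 splat of word 5 then produces the full byte
// splat.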
14210 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14211 for (int i = 0; i < 16; i += 2)
14212 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14213 return false;
14214
14215 return true;
14216 };
14217 auto tryToWidenViaDuplication = [&]() -> SDValue {
14218 if (!canWidenViaDuplication(Mask))
14219 return SDValue();
14220 SmallVector<int, 4> LoInputs;
14221 copy_if(Mask, std::back_inserter(LoInputs),
14222 [](int M) { return M >= 0 && M < 8; });
14223 array_pod_sort(LoInputs.begin(), LoInputs.end());
14224 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14225 SmallVector<int, 4> HiInputs;
14226 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14227 array_pod_sort(HiInputs.begin(), HiInputs.end());
14228 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14229
14230 bool TargetLo = LoInputs.size() >= HiInputs.size();
14231 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14232 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14233
14234 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14235 SmallDenseMap<int, int, 8> LaneMap;
14236 for (int I : InPlaceInputs) {
14237 PreDupI16Shuffle[I/2] = I/2;
14238 LaneMap[I] = I;
14239 }
14240 int j = TargetLo ? 0 : 4, je = j + 4;
14241 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14242 // Check if j is already a shuffle of this input. This happens when
14243 // there are two adjacent bytes after we move the low one.
14244 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14245 // If we haven't yet mapped the input, search for a slot into which
14246 // we can map it.
14247 while (j < je && PreDupI16Shuffle[j] >= 0)
14248 ++j;
14249
14250 if (j == je)
14251 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14252 return SDValue();
14253
14254 // Map this input with the i16 shuffle.
14255 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14256 }
14257
14258 // Update the lane map based on the mapping we ended up with.
14259 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14260 }
14261 V1 = DAG.getBitcast(
14262 MVT::v16i8,
14263 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14264 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14265
14266 // Unpack the bytes to form the i16s that will be shuffled into place.
14267 bool EvenInUse = false, OddInUse = false;
14268 for (int i = 0; i < 16; i += 2) {
14269 EvenInUse |= (Mask[i + 0] >= 0);
14270 OddInUse |= (Mask[i + 1] >= 0);
14271 if (EvenInUse && OddInUse)
14272 break;
14273 }
14274 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14275 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14276 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14277
14278 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14279 for (int i = 0; i < 16; ++i)
14280 if (Mask[i] >= 0) {
14281 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14282 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14283 if (PostDupI16Shuffle[i / 2] < 0)
14284 PostDupI16Shuffle[i / 2] = MappedMask;
14285 else
14286 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14287 "Conflicting entries in the original shuffle!");
14288 }
14289 return DAG.getBitcast(
14290 MVT::v16i8,
14291 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14292 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14293 };
14294 if (SDValue V = tryToWidenViaDuplication())
14295 return V;
14296 }
14297
14298 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14299 Zeroable, Subtarget, DAG))
14300 return Masked;
14301
14302 // Use dedicated unpack instructions for masks that match their pattern.
14303 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14304 return V;
14305
14306 // Try to use byte shift instructions to mask.
14307 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14308 Zeroable, Subtarget, DAG))
14309 return V;
14310
14311 // Check for compaction patterns.
14312 bool IsSingleInput = V2.isUndef();
14313 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14314
14315 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14316 // with PSHUFB. It is important to do this before we attempt to generate any
14317 // blends but after all of the single-input lowerings. If the single input
14318 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14319 // want to preserve that and we can DAG combine any longer sequences into
14320 // a PSHUFB in the end. But once we start blending from multiple inputs,
14321 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14322 // and there are *very* few patterns that would actually be faster than the
14323 // PSHUFB approach because of its ability to zero lanes.
14324 //
14325 // If the mask is a binary compaction, we can more efficiently perform this
14326 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14327 //
14328 // FIXME: The only exceptions to the above are blends which are exact
14329 // interleavings with direct instructions supporting them. We currently don't
14330 // handle those well here.
14331 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14332 bool V1InUse = false;
14333 bool V2InUse = false;
14334
14335 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14336 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14337
14338 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14339 // do so. This avoids using them to handle blends-with-zero which is
14340 // important as a single pshufb is significantly faster for that.
14341 if (V1InUse && V2InUse) {
14342 if (Subtarget.hasSSE41())
14343 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14344 Zeroable, Subtarget, DAG))
14345 return Blend;
14346
14347 // We can use an unpack to do the blending rather than an or in some
14348 // cases. Even though the or may be (very slightly) more efficient, we
14349 // prefer this lowering because there are common cases where part of
14350 // the complexity of the shuffles goes away when we do the final blend as
14351 // an unpack.
14352 // FIXME: It might be worth trying to detect if the unpack-feeding
14353 // shuffles will both be pshufb, in which case we shouldn't bother with
14354 // this.
14355 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14356 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14357 return Unpack;
14358
14359 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14360 if (Subtarget.hasVBMI())
14361 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14362 DAG);
14363
14364 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14365 if (Subtarget.hasXOP()) {
14366 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14367 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14368 }
14369
14370 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14371 // PALIGNR will be cheaper than the second PSHUFB+OR.
14372 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14373 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14374 return V;
14375 }
14376
14377 return PSHUFB;
14378 }
14379
14380 // There are special ways we can lower some single-element blends.
14381 if (NumV2Elements == 1)
14382 if (SDValue V = lowerShuffleAsElementInsertion(
14383 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14384 return V;
14385
14386 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14387 return Blend;
14388
14389 // Check whether a compaction lowering can be done. This handles shuffles
14390 // which take every Nth element for some even N. See the helper function for
14391 // details.
14392 //
14393 // We special case these as they can be particularly efficiently handled with
14394 // the PACKUSWB instruction on x86 and they show up in common patterns of
14395 // rearranging bytes to truncate wide elements.
14396 if (NumEvenDrops) {
14397 // NumEvenDrops is the power of two stride of the elements. Another way of
14398 // thinking about it is that we need to drop the even elements this many
14399 // times to get the original input.
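// Illustrative values: NumEvenDrops == 1 corresponds to a mask taking every
// 2nd byte (mask each i16 lane to 0x00FF, then one PACKUS), while
// NumEvenDrops == 2 corresponds to every 4th byte (the word-clear mask below
// keeps only words 0, 2, 4 and 6, and a second PACKUS folds the result again).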
14400
14401 // First we need to zero all the dropped bytes.
14402 assert(NumEvenDrops <= 3 &&
14403 "No support for dropping even elements more than 3 times.");
14404 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14405 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14406 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14407 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14408 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14409 WordClearMask);
14410 if (!IsSingleInput)
14411 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14412 WordClearMask);
14413
14414 // Now pack things back together.
14415 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14416 IsSingleInput ? V1 : V2);
14417 for (int i = 1; i < NumEvenDrops; ++i) {
14418 Result = DAG.getBitcast(MVT::v8i16, Result);
14419 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14420 }
14421 return Result;
14422 }
14423
14424 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14425 if (NumOddDrops == 1) {
14426 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14427 DAG.getBitcast(MVT::v8i16, V1),
14428 DAG.getTargetConstant(8, DL, MVT::i8));
14429 if (!IsSingleInput)
14430 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14431 DAG.getBitcast(MVT::v8i16, V2),
14432 DAG.getTargetConstant(8, DL, MVT::i8));
14433 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14434 IsSingleInput ? V1 : V2);
14435 }
14436
14437 // Handle multi-input cases by blending/unpacking single-input shuffles.
14438 if (NumV2Elements > 0)
14439 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14440 Subtarget, DAG);
14441
14442 // The fallback path for single-input shuffles widens this into two v8i16
14443 // vectors with unpacks, shuffles those, and then pulls them back together
14444 // with a pack.
14445 SDValue V = V1;
14446
14447 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14448 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14449 for (int i = 0; i < 16; ++i)
14450 if (Mask[i] >= 0)
14451 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14452
14453 SDValue VLoHalf, VHiHalf;
14454 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14455 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14456 // i16s.
14457 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14458 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14459 // Use a mask to drop the high bytes.
14460 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14461 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14462 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14463
14464 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14465 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14466
14467 // Squash the masks to point directly into VLoHalf.
14468 for (int &M : LoBlendMask)
14469 if (M >= 0)
14470 M /= 2;
14471 for (int &M : HiBlendMask)
14472 if (M >= 0)
14473 M /= 2;
14474 } else {
14475 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14476 // VHiHalf so that we can blend them as i16s.
14477 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14478
14479 VLoHalf = DAG.getBitcast(
14480 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14481 VHiHalf = DAG.getBitcast(
14482 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14483 }
14484
14485 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14486 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14487
14488 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14489}
14490
14491/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14492///
14493/// This routine breaks down the specific type of 128-bit shuffle and
14494/// dispatches to the lowering routines accordingly.
14495 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14496 MVT VT, SDValue V1, SDValue V2,
14497 const APInt &Zeroable,
14498 const X86Subtarget &Subtarget,
14499 SelectionDAG &DAG) {
14500 if (VT == MVT::v8bf16) {
14501 V1 = DAG.getBitcast(MVT::v8i16, V1);
14502 V2 = DAG.getBitcast(MVT::v8i16, V2);
14503 return DAG.getBitcast(VT,
14504 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14505 }
14506
14507 switch (VT.SimpleTy) {
14508 case MVT::v2i64:
14509 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14510 case MVT::v2f64:
14511 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14512 case MVT::v4i32:
14513 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14514 case MVT::v4f32:
14515 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14516 case MVT::v8i16:
14517 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14518 case MVT::v8f16:
14519 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14520 case MVT::v16i8:
14521 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14522
14523 default:
14524 llvm_unreachable("Unimplemented!");
14525 }
14526}
14527
14528/// Generic routine to split vector shuffle into half-sized shuffles.
14529///
14530/// This routine just extracts two subvectors, shuffles them independently, and
14531/// then concatenates them back together. This should work effectively with all
14532/// AVX vector shuffle types.
14533 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14534 SDValue V2, ArrayRef<int> Mask,
14535 SelectionDAG &DAG, bool SimpleOnly) {
14536 assert(VT.getSizeInBits() >= 256 &&
14537 "Only for 256-bit or wider vector shuffles!");
14538 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14539 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14540
14541 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14542 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14543
14544 int NumElements = VT.getVectorNumElements();
14545 int SplitNumElements = NumElements / 2;
14546 MVT ScalarVT = VT.getVectorElementType();
14547 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14548
14549 // Use splitVector/extractSubVector so that split build-vectors just build two
14550 // narrower build vectors. This helps shuffling with splats and zeros.
14551 auto SplitVector = [&](SDValue V) {
14552 SDValue LoV, HiV;
14553 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14554 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14555 DAG.getBitcast(SplitVT, HiV));
14556 };
14557
14558 SDValue LoV1, HiV1, LoV2, HiV2;
14559 std::tie(LoV1, HiV1) = SplitVector(V1);
14560 std::tie(LoV2, HiV2) = SplitVector(V2);
14561
14562 // Now create two 4-way blends of these half-width vectors.
14563 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14564 bool &UseHiV1, bool &UseLoV2,
14565 bool &UseHiV2) {
14566 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14567 for (int i = 0; i < SplitNumElements; ++i) {
14568 int M = HalfMask[i];
14569 if (M >= NumElements) {
14570 if (M >= NumElements + SplitNumElements)
14571 UseHiV2 = true;
14572 else
14573 UseLoV2 = true;
14574 } else if (M >= 0) {
14575 if (M >= SplitNumElements)
14576 UseHiV1 = true;
14577 else
14578 UseLoV1 = true;
14579 }
14580 }
14581 };
14582
14583 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14584 if (!SimpleOnly)
14585 return true;
14586
14587 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14588 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14589
14590 return !(UseHiV1 || UseHiV2);
14591 };
14592
14593 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14594 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14595 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14596 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14597 for (int i = 0; i < SplitNumElements; ++i) {
14598 int M = HalfMask[i];
14599 if (M >= NumElements) {
14600 V2BlendMask[i] = M - NumElements;
14601 BlendMask[i] = SplitNumElements + i;
14602 } else if (M >= 0) {
14603 V1BlendMask[i] = M;
14604 BlendMask[i] = i;
14605 }
14606 }
14607
14608 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14609 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14610
14611 // Because the lowering happens after all combining takes place, we need to
14612 // manually combine these blend masks as much as possible so that we create
14613 // a minimal number of high-level vector shuffle nodes.
14614 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14615
14616 // First try just blending the halves of V1 or V2.
14617 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14618 return DAG.getUNDEF(SplitVT);
14619 if (!UseLoV2 && !UseHiV2)
14620 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14621 if (!UseLoV1 && !UseHiV1)
14622 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14623
14624 SDValue V1Blend, V2Blend;
14625 if (UseLoV1 && UseHiV1) {
14626 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14627 } else {
14628 // We only use half of V1 so map the usage down into the final blend mask.
14629 V1Blend = UseLoV1 ? LoV1 : HiV1;
14630 for (int i = 0; i < SplitNumElements; ++i)
14631 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14632 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14633 }
14634 if (UseLoV2 && UseHiV2) {
14635 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14636 } else {
14637 // We only use half of V2 so map the usage down into the final blend mask.
14638 V2Blend = UseLoV2 ? LoV2 : HiV2;
14639 for (int i = 0; i < SplitNumElements; ++i)
14640 if (BlendMask[i] >= SplitNumElements)
14641 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14642 }
14643 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14644 };
14645
14646 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14647 return SDValue();
14648
14649 SDValue Lo = HalfBlend(LoMask);
14650 SDValue Hi = HalfBlend(HiMask);
14651 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14652}
14653
14654/// Either split a vector in halves or decompose the shuffles and the
14655/// blend/unpack.
14656///
14657/// This is provided as a good fallback for many lowerings of non-single-input
14658/// shuffles with more than one 128-bit lane. In those cases, we want to select
14659/// between splitting the shuffle into 128-bit components and stitching those
14660/// back together vs. extracting the single-input shuffles and blending those
14661/// results.
14662 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14663 SDValue V2, ArrayRef<int> Mask,
14664 const X86Subtarget &Subtarget,
14665 SelectionDAG &DAG) {
14666 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14667 "shuffles as it could then recurse on itself.");
14668 int Size = Mask.size();
14669
14670 // If this can be modeled as a broadcast of two elements followed by a blend,
14671 // prefer that lowering. This is especially important because broadcasts can
14672 // often fold with memory operands.
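// Illustrative example: for a v8 mask such as <0, 12, 0, 12, 0, 12, 0, 12>,
// every V1 reference is element 0 of V1 and every V2 reference is element 4
// of V2, so the shuffle is a blend of two broadcasts and the check below
// succeeds.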
14673 auto DoBothBroadcast = [&] {
14674 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14675 for (int M : Mask)
14676 if (M >= Size) {
14677 if (V2BroadcastIdx < 0)
14678 V2BroadcastIdx = M - Size;
14679 else if (M - Size != V2BroadcastIdx)
14680 return false;
14681 } else if (M >= 0) {
14682 if (V1BroadcastIdx < 0)
14683 V1BroadcastIdx = M;
14684 else if (M != V1BroadcastIdx)
14685 return false;
14686 }
14687 return true;
14688 };
14689 if (DoBothBroadcast())
14690 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14691 DAG);
14692
14693 // If the inputs all stem from a single 128-bit lane of each input, then we
14694 // split them rather than blending because the split will decompose to
14695 // unusually few instructions.
14696 int LaneCount = VT.getSizeInBits() / 128;
14697 int LaneSize = Size / LaneCount;
14698 SmallBitVector LaneInputs[2];
14699 LaneInputs[0].resize(LaneCount, false);
14700 LaneInputs[1].resize(LaneCount, false);
14701 for (int i = 0; i < Size; ++i)
14702 if (Mask[i] >= 0)
14703 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14704 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14705 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14706 /*SimpleOnly*/ false);
14707
14708 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14709 // requires that the decomposed single-input shuffles don't end up here.
14710 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14711 DAG);
14712}
14713
14714// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14715// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14716 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14717 SDValue V1, SDValue V2,
14718 ArrayRef<int> Mask,
14719 SelectionDAG &DAG) {
14720 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14721
14722 int LHSMask[4] = {-1, -1, -1, -1};
14723 int RHSMask[4] = {-1, -1, -1, -1};
14724 unsigned SHUFPMask = 0;
14725
14726 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14727 // perform the shuffle once the lanes have been shuffled in place.
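// Worked example (hypothetical mask): Mask = <1, 5, 2, 7> produces
// LHSMask = <u, 1, 2, u>, RHSMask = <u, 5, u, 7> and SHUFPMask = 0b1011,
// i.e. SHUFPD takes the high element of each selected lane except for
// result element 2.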
14728 for (int i = 0; i != 4; ++i) {
14729 int M = Mask[i];
14730 if (M < 0)
14731 continue;
14732 int LaneBase = i & ~1;
14733 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14734 LaneMask[LaneBase + (M & 1)] = M;
14735 SHUFPMask |= (M & 1) << i;
14736 }
14737
14738 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14739 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14740 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14741 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14742}
14743
14744/// Lower a vector shuffle crossing multiple 128-bit lanes as
14745/// a lane permutation followed by a per-lane permutation.
14746///
14747/// This is mainly for cases where we can have non-repeating permutes
14748/// in each lane.
14749///
14750/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14751/// we should investigate merging them.
14752 static SDValue lowerShuffleAsLanePermuteAndPermute(
14753 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14754 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14755 int NumElts = VT.getVectorNumElements();
14756 int NumLanes = VT.getSizeInBits() / 128;
14757 int NumEltsPerLane = NumElts / NumLanes;
14758 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14759
14760 /// Attempts to find a sublane permute with the given size
14761 /// that gets all elements into their target lanes.
14762 ///
14763 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14764 /// If unsuccessful, returns false and may overwrite InLaneMask.
14765 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14766 int NumSublanesPerLane = NumSublanes / NumLanes;
14767 int NumEltsPerSublane = NumElts / NumSublanes;
14768
14769 SmallVector<int, 16> CrossLaneMask;
14770 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14771 // CrossLaneMask but one entry == one sublane.
14772 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14773
14774 for (int i = 0; i != NumElts; ++i) {
14775 int M = Mask[i];
14776 if (M < 0)
14777 continue;
14778
14779 int SrcSublane = M / NumEltsPerSublane;
14780 int DstLane = i / NumEltsPerLane;
14781
14782 // We only need to get the elements into the right lane, not sublane.
14783 // So search all sublanes that make up the destination lane.
14784 bool Found = false;
14785 int DstSubStart = DstLane * NumSublanesPerLane;
14786 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14787 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14788 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14789 continue;
14790
14791 Found = true;
14792 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14793 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14794 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14795 break;
14796 }
14797 if (!Found)
14798 return SDValue();
14799 }
14800
14801 // Fill CrossLaneMask using CrossLaneMaskLarge.
14802 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14803
14804 if (!CanUseSublanes) {
14805 // If we're only shuffling a single lowest lane and the rest are identity
14806 // then don't bother.
14807 // TODO - isShuffleMaskInputInPlace could be extended to something like
14808 // this.
14809 int NumIdentityLanes = 0;
14810 bool OnlyShuffleLowestLane = true;
14811 for (int i = 0; i != NumLanes; ++i) {
14812 int LaneOffset = i * NumEltsPerLane;
14813 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14814 i * NumEltsPerLane))
14815 NumIdentityLanes++;
14816 else if (CrossLaneMask[LaneOffset] != 0)
14817 OnlyShuffleLowestLane = false;
14818 }
14819 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14820 return SDValue();
14821 }
14822
14823 // Avoid returning the same shuffle operation. For example,
14824 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14825 // undef:v16i16
14826 if (CrossLaneMask == Mask || InLaneMask == Mask)
14827 return SDValue();
14828
14829 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14830 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14831 InLaneMask);
14832 };
14833
14834 // First attempt a solution with full lanes.
14835 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14836 return V;
14837
14838 // The rest of the solutions use sublanes.
14839 if (!CanUseSublanes)
14840 return SDValue();
14841
14842 // Then attempt a solution with 64-bit sublanes (vpermq).
14843 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14844 return V;
14845
14846 // If that doesn't work and we have fast variable cross-lane shuffle,
14847 // attempt 32-bit sublanes (vpermd).
14848 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14849 return SDValue();
14850
14851 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14852}
14853
14854 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
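/// For example (illustrative), a v4 mask <2, 1, 3, 0> with LaneSize 2 becomes
/// the in-lane mask <4, 1, 3, 6>: elements that cross a lane are redirected to
/// the second shuffle operand (offset by Size), which the caller provides as a
/// lane-flipped copy of the input.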
14855static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14856 SmallVector<int> &InLaneMask) {
14857 int Size = Mask.size();
14858 InLaneMask.assign(Mask.begin(), Mask.end());
14859 for (int i = 0; i < Size; ++i) {
14860 int &M = InLaneMask[i];
14861 if (M < 0)
14862 continue;
14863 if (((M % Size) / LaneSize) != (i / LaneSize))
14864 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14865 }
14866}
14867
14868/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14869/// source with a lane permutation.
14870///
14871/// This lowering strategy results in four instructions in the worst case for a
14872/// single-input cross lane shuffle which is lower than any other fully general
14873/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14874/// shuffle pattern should be handled prior to trying this lowering.
14875 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14876 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14877 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14878 // FIXME: This should probably be generalized for 512-bit vectors as well.
14879 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14880 int Size = Mask.size();
14881 int LaneSize = Size / 2;
14882
14883 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14884 // Only do this if the elements aren't all from the lower lane,
14885 // otherwise we're (probably) better off doing a split.
14886 if (VT == MVT::v4f64 &&
14887 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14888 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14889
14890 // If there are only inputs from one 128-bit lane, splitting will in fact be
14891 // less expensive. The flags track whether the given lane contains an element
14892 // that crosses to another lane.
14893 bool AllLanes;
14894 if (!Subtarget.hasAVX2()) {
14895 bool LaneCrossing[2] = {false, false};
14896 for (int i = 0; i < Size; ++i)
14897 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14898 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14899 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14900 } else {
14901 bool LaneUsed[2] = {false, false};
14902 for (int i = 0; i < Size; ++i)
14903 if (Mask[i] >= 0)
14904 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14905 AllLanes = LaneUsed[0] && LaneUsed[1];
14906 }
14907
14908 // TODO - we could support shuffling V2 in the Flipped input.
14909 assert(V2.isUndef() &&
14910 "This last part of this routine only works on single input shuffles");
14911
14912 SmallVector<int> InLaneMask;
14913 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14914
14915 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14916 "In-lane shuffle mask expected");
14917
14918 // If we're not using both lanes in each lane and the inlane mask is not
14919 // repeating, then we're better off splitting.
14920 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14921 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14922 /*SimpleOnly*/ false);
14923
14924 // Flip the lanes, and shuffle the results which should now be in-lane.
14925 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14926 SDValue Flipped = DAG.getBitcast(PVT, V1);
14927 Flipped =
14928 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14929 Flipped = DAG.getBitcast(VT, Flipped);
14930 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14931}
14932
14933/// Handle lowering 2-lane 128-bit shuffles.
14934 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14935 SDValue V2, ArrayRef<int> Mask,
14936 const APInt &Zeroable,
14937 const X86Subtarget &Subtarget,
14938 SelectionDAG &DAG) {
14939 if (V2.isUndef()) {
14940 // Attempt to match VBROADCAST*128 subvector broadcast load.
14941 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14942 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14943 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14944 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14945 MVT MemVT = VT.getHalfNumVectorElementsVT();
14946 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14947 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14948 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14949 VT, MemVT, Ld, Ofs, DAG))
14950 return BcstLd;
14951 }
14952
14953 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14954 if (Subtarget.hasAVX2())
14955 return SDValue();
14956 }
14957
14958 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14959
14960 SmallVector<int, 4> WidenedMask;
14961 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14962 return SDValue();
14963
14964 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14965 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14966
14967 // Try to use an insert into a zero vector.
14968 if (WidenedMask[0] == 0 && IsHighZero) {
14969 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14970 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14971 DAG.getIntPtrConstant(0, DL));
14972 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14973 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14974 DAG.getIntPtrConstant(0, DL));
14975 }
14976
14977 // TODO: If minimizing size and one of the inputs is a zero vector and the
14978 // zero vector has only one use, we could use a VPERM2X128 to save the
14979 // instruction bytes needed to explicitly generate the zero vector.
14980
14981 // Blends are faster and handle all the non-lane-crossing cases.
14982 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14983 Subtarget, DAG))
14984 return Blend;
14985
14986 // If either input operand is a zero vector, use VPERM2X128 because its mask
14987 // allows us to replace the zero input with an implicit zero.
14988 if (!IsLowZero && !IsHighZero) {
14989 // Check for patterns which can be matched with a single insert of a 128-bit
14990 // subvector.
14991 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14992 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14993
14994 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14995 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14996 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14997 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14998 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14999 OnlyUsesV1 ? V1 : V2,
15000 DAG.getIntPtrConstant(0, DL));
15001 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15002 DAG.getIntPtrConstant(2, DL));
15003 }
15004 }
15005
15006 // Try to use SHUF128 if possible.
15007 if (Subtarget.hasVLX()) {
15008 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15009 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15010 ((WidenedMask[1] % 2) << 1);
15011 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15012 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15013 }
15014 }
15015 }
15016
15017 // Otherwise form a 128-bit permutation. After accounting for undefs,
15018 // convert the 64-bit shuffle mask selection values into 128-bit
15019 // selection bits by dividing the indexes by 2 and shifting into positions
15020 // defined by a vperm2*128 instruction's immediate control byte.
15021
15022 // The immediate permute control byte looks like this:
15023 // [1:0] - select 128 bits from sources for low half of destination
15024 // [2] - ignore
15025 // [3] - zero low half of destination
15026 // [5:4] - select 128 bits from sources for high half of destination
15027 // [6] - ignore
15028 // [7] - zero high half of destination
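// Illustrative example: WidenedMask = <0, 3> selects the low 128 bits of V1
// for the low half and the high 128 bits of V2 for the high half, giving
// PermMask = (0 << 0) | (3 << 4) = 0x30.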
15029
15030 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15031 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15032
15033 unsigned PermMask = 0;
15034 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15035 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15036
15037 // Check the immediate mask and replace unused sources with undef.
15038 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15039 V1 = DAG.getUNDEF(VT);
15040 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15041 V2 = DAG.getUNDEF(VT);
15042
15043 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15044 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15045}
15046
15047/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15048/// shuffling each lane.
15049///
15050/// This attempts to create a repeated lane shuffle where each lane uses one
15051/// or two of the lanes of the inputs. The lanes of the input vectors are
15052/// shuffled in one or two independent shuffles to get the lanes into the
15053/// position needed by the final shuffle.
15054 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15055 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15056 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15057 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15058
15059 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15060 return SDValue();
15061
15062 int NumElts = Mask.size();
15063 int NumLanes = VT.getSizeInBits() / 128;
15064 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15065 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15066 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15067
15068 // First pass will try to fill in the RepeatMask from lanes that need two
15069 // sources.
15070 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15071 int Srcs[2] = {-1, -1};
15072 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15073 for (int i = 0; i != NumLaneElts; ++i) {
15074 int M = Mask[(Lane * NumLaneElts) + i];
15075 if (M < 0)
15076 continue;
15077 // Determine which of the possible input lanes (NumLanes from each source)
15078 // this element comes from. Assign that as one of the sources for this
15079 // lane. We can assign up to 2 sources for this lane. If we run out of
15080 // sources we can't do anything.
15081 int LaneSrc = M / NumLaneElts;
15082 int Src;
15083 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15084 Src = 0;
15085 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15086 Src = 1;
15087 else
15088 return SDValue();
15089
15090 Srcs[Src] = LaneSrc;
15091 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15092 }
15093
15094 // If this lane has two sources, see if it fits with the repeat mask so far.
15095 if (Srcs[1] < 0)
15096 continue;
15097
15098 LaneSrcs[Lane][0] = Srcs[0];
15099 LaneSrcs[Lane][1] = Srcs[1];
15100
15101 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15102 assert(M1.size() == M2.size() && "Unexpected mask size");
15103 for (int i = 0, e = M1.size(); i != e; ++i)
15104 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15105 return false;
15106 return true;
15107 };
15108
15109 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15110 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15111 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15112 int M = Mask[i];
15113 if (M < 0)
15114 continue;
15115 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15116 "Unexpected mask element");
15117 MergedMask[i] = M;
15118 }
15119 };
15120
15121 if (MatchMasks(InLaneMask, RepeatMask)) {
15122 // Merge this lane mask into the final repeat mask.
15123 MergeMasks(InLaneMask, RepeatMask);
15124 continue;
15125 }
15126
15127 // Didn't find a match. Swap the operands and try again.
15128 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15129 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15130
15131 if (MatchMasks(InLaneMask, RepeatMask)) {
15132 // Merge this lane mask into the final repeat mask.
15133 MergeMasks(InLaneMask, RepeatMask);
15134 continue;
15135 }
15136
15137 // Couldn't find a match with the operands in either order.
15138 return SDValue();
15139 }
15140
15141 // Now handle any lanes with only one source.
15142 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15143 // If this lane has already been processed, skip it.
15144 if (LaneSrcs[Lane][0] >= 0)
15145 continue;
15146
15147 for (int i = 0; i != NumLaneElts; ++i) {
15148 int M = Mask[(Lane * NumLaneElts) + i];
15149 if (M < 0)
15150 continue;
15151
15152 // If RepeatMask isn't defined yet we can define it ourselves.
15153 if (RepeatMask[i] < 0)
15154 RepeatMask[i] = M % NumLaneElts;
15155
15156 if (RepeatMask[i] < NumElts) {
15157 if (RepeatMask[i] != M % NumLaneElts)
15158 return SDValue();
15159 LaneSrcs[Lane][0] = M / NumLaneElts;
15160 } else {
15161 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15162 return SDValue();
15163 LaneSrcs[Lane][1] = M / NumLaneElts;
15164 }
15165 }
15166
15167 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15168 return SDValue();
15169 }
15170
15171 SmallVector<int, 16> NewMask(NumElts, -1);
15172 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15173 int Src = LaneSrcs[Lane][0];
15174 for (int i = 0; i != NumLaneElts; ++i) {
15175 int M = -1;
15176 if (Src >= 0)
15177 M = Src * NumLaneElts + i;
15178 NewMask[Lane * NumLaneElts + i] = M;
15179 }
15180 }
15181 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15182 // Ensure we didn't get back the shuffle we started with.
15183 // FIXME: This is a hack to make up for some splat handling code in
15184 // getVectorShuffle.
15185 if (isa<ShuffleVectorSDNode>(NewV1) &&
15186 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15187 return SDValue();
15188
15189 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15190 int Src = LaneSrcs[Lane][1];
15191 for (int i = 0; i != NumLaneElts; ++i) {
15192 int M = -1;
15193 if (Src >= 0)
15194 M = Src * NumLaneElts + i;
15195 NewMask[Lane * NumLaneElts + i] = M;
15196 }
15197 }
15198 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15199 // Ensure we didn't get back the shuffle we started with.
15200 // FIXME: This is a hack to make up for some splat handling code in
15201 // getVectorShuffle.
15202 if (isa<ShuffleVectorSDNode>(NewV2) &&
15203 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15204 return SDValue();
15205
15206 for (int i = 0; i != NumElts; ++i) {
15207 if (Mask[i] < 0) {
15208 NewMask[i] = -1;
15209 continue;
15210 }
15211 NewMask[i] = RepeatMask[i % NumLaneElts];
15212 if (NewMask[i] < 0)
15213 continue;
15214
15215 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15216 }
15217 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15218}
15219
15220/// If the input shuffle mask results in a vector that is undefined in all upper
15221/// or lower half elements and that mask accesses only 2 halves of the
15222/// shuffle's operands, return true. A mask of half the width with mask indexes
15223/// adjusted to access the extracted halves of the original shuffle operands is
15224/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15225/// lower half of each input operand is accessed.
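/// For example (illustrative), the v8 mask <2, 9, u, u, u, u, u, u> has an
/// undef upper half and produces HalfMask = <2, 5, u, u> with HalfIdx1 = 0
/// (lower half of V1) and HalfIdx2 = 2 (lower half of V2).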
15226static bool
15227 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15228 int &HalfIdx1, int &HalfIdx2) {
15229 assert((Mask.size() == HalfMask.size() * 2) &&
15230 "Expected input mask to be twice as long as output");
15231
15232 // Exactly one half of the result must be undef to allow narrowing.
15233 bool UndefLower = isUndefLowerHalf(Mask);
15234 bool UndefUpper = isUndefUpperHalf(Mask);
15235 if (UndefLower == UndefUpper)
15236 return false;
15237
15238 unsigned HalfNumElts = HalfMask.size();
15239 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15240 HalfIdx1 = -1;
15241 HalfIdx2 = -1;
15242 for (unsigned i = 0; i != HalfNumElts; ++i) {
15243 int M = Mask[i + MaskIndexOffset];
15244 if (M < 0) {
15245 HalfMask[i] = M;
15246 continue;
15247 }
15248
15249 // Determine which of the 4 half vectors this element is from.
15250 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15251 int HalfIdx = M / HalfNumElts;
15252
15253 // Determine the element index into its half vector source.
15254 int HalfElt = M % HalfNumElts;
15255
15256 // We can shuffle with up to 2 half vectors, set the new 'half'
15257 // shuffle mask accordingly.
15258 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15259 HalfMask[i] = HalfElt;
15260 HalfIdx1 = HalfIdx;
15261 continue;
15262 }
15263 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15264 HalfMask[i] = HalfElt + HalfNumElts;
15265 HalfIdx2 = HalfIdx;
15266 continue;
15267 }
15268
15269 // Too many half vectors referenced.
15270 return false;
15271 }
15272
15273 return true;
15274}
15275
15276/// Given the output values from getHalfShuffleMask(), create a half width
15277/// shuffle of extracted vectors followed by an insert back to full width.
15278 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15279 ArrayRef<int> HalfMask, int HalfIdx1,
15280 int HalfIdx2, bool UndefLower,
15281 SelectionDAG &DAG, bool UseConcat = false) {
15282 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15283 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15284
15285 MVT VT = V1.getSimpleValueType();
15286 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15287 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15288
15289 auto getHalfVector = [&](int HalfIdx) {
15290 if (HalfIdx < 0)
15291 return DAG.getUNDEF(HalfVT);
15292 SDValue V = (HalfIdx < 2 ? V1 : V2);
15293 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15294 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15295 DAG.getIntPtrConstant(HalfIdx, DL));
15296 };
15297
15298 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15299 SDValue Half1 = getHalfVector(HalfIdx1);
15300 SDValue Half2 = getHalfVector(HalfIdx2);
15301 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15302 if (UseConcat) {
15303 SDValue Op0 = V;
15304 SDValue Op1 = DAG.getUNDEF(HalfVT);
15305 if (UndefLower)
15306 std::swap(Op0, Op1);
15307 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15308 }
15309
15310 unsigned Offset = UndefLower ? HalfNumElts : 0;
15311 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15312 DAG.getIntPtrConstant(Offset, DL));
15313}
15314
15315/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15316/// This allows for fast cases such as subvector extraction/insertion
15317/// or shuffling smaller vector types which can lower more efficiently.
15318 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15319 SDValue V2, ArrayRef<int> Mask,
15320 const X86Subtarget &Subtarget,
15321 SelectionDAG &DAG) {
15322 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15323 "Expected 256-bit or 512-bit vector");
15324
15325 bool UndefLower = isUndefLowerHalf(Mask);
15326 if (!UndefLower && !isUndefUpperHalf(Mask))
15327 return SDValue();
15328
15329 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15330 "Completely undef shuffle mask should have been simplified already");
15331
15332 // Upper half is undef and lower half is whole upper subvector.
15333 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15334 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15335 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15336 if (!UndefLower &&
15337 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15338 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15339 DAG.getIntPtrConstant(HalfNumElts, DL));
15340 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15341 DAG.getIntPtrConstant(0, DL));
15342 }
15343
15344 // Lower half is undef and upper half is whole lower subvector.
15345 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15346 if (UndefLower &&
15347 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15348 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15349 DAG.getIntPtrConstant(0, DL));
15350 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15351 DAG.getIntPtrConstant(HalfNumElts, DL));
15352 }
15353
15354 int HalfIdx1, HalfIdx2;
15355 SmallVector<int, 8> HalfMask(HalfNumElts);
15356 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15357 return SDValue();
15358
15359 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15360
15361 // Only shuffle the halves of the inputs when useful.
15362 unsigned NumLowerHalves =
15363 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15364 unsigned NumUpperHalves =
15365 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15366 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15367
15368 // Determine the larger pattern of undef/halves, then decide if it's worth
15369 // splitting the shuffle based on subtarget capabilities and types.
15370 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15371 if (!UndefLower) {
15372 // XXXXuuuu: no insert is needed.
15373 // Always extract lowers when setting lower - these are all free subreg ops.
15374 if (NumUpperHalves == 0)
15375 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15376 UndefLower, DAG);
15377
15378 if (NumUpperHalves == 1) {
15379 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15380 if (Subtarget.hasAVX2()) {
15381 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15382 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15383 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15384 (!isSingleSHUFPSMask(HalfMask) ||
15385 Subtarget.hasFastVariableCrossLaneShuffle()))
15386 return SDValue();
15387 // If this is a unary shuffle (assume that the 2nd operand is
15388 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15389 // are better off extracting the upper half of 1 operand and using a
15390 // narrow shuffle.
15391 if (EltWidth == 64 && V2.isUndef())
15392 return SDValue();
15393 }
15394 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15395 if (Subtarget.hasAVX512() && VT.is512BitVector())
15396 return SDValue();
15397 // Extract + narrow shuffle is better than the wide alternative.
15398 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15399 UndefLower, DAG);
15400 }
15401
15402 // Don't extract both uppers, instead shuffle and then extract.
15403 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15404 return SDValue();
15405 }
15406
15407 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15408 if (NumUpperHalves == 0) {
15409 // AVX2 has efficient 64-bit element cross-lane shuffles.
15410 // TODO: Refine to account for unary shuffle, splat, and other masks?
15411 if (Subtarget.hasAVX2() && EltWidth == 64)
15412 return SDValue();
15413 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15414 if (Subtarget.hasAVX512() && VT.is512BitVector())
15415 return SDValue();
15416 // Narrow shuffle + insert is better than the wide alternative.
15417 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15418 UndefLower, DAG);
15419 }
15420
15421 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15422 return SDValue();
15423}
15424
15425/// Handle case where shuffle sources are coming from the same 128-bit lane and
15426/// every lane can be represented as the same repeating mask - allowing us to
15427/// shuffle the sources with the repeating shuffle and then permute the result
15428/// to the destination lanes.
15429 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15430 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15431 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15432 int NumElts = VT.getVectorNumElements();
15433 int NumLanes = VT.getSizeInBits() / 128;
15434 int NumLaneElts = NumElts / NumLanes;
15435
15436 // On AVX2 we may be able to just shuffle the lowest elements and then
15437 // broadcast the result.
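// Illustrative example: the v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0> repeats a
// 64-bit pattern from the lowest 128-bit lane, so it can be lowered as a
// shuffle of the low elements <1, 0, u, u, u, u, u, u> followed by a 64-bit
// broadcast of the result.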
15438 if (Subtarget.hasAVX2()) {
15439 for (unsigned BroadcastSize : {16, 32, 64}) {
15440 if (BroadcastSize <= VT.getScalarSizeInBits())
15441 continue;
15442 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15443
15444 // Attempt to match a repeating pattern every NumBroadcastElts,
15445 // accounting for UNDEFs but only references the lowest 128-bit
15446 // lane of the inputs.
15447 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15448 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15449 for (int j = 0; j != NumBroadcastElts; ++j) {
15450 int M = Mask[i + j];
15451 if (M < 0)
15452 continue;
15453 int &R = RepeatMask[j];
15454 if (0 != ((M % NumElts) / NumLaneElts))
15455 return false;
15456 if (0 <= R && R != M)
15457 return false;
15458 R = M;
15459 }
15460 return true;
15461 };
15462
15463 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15464 if (!FindRepeatingBroadcastMask(RepeatMask))
15465 continue;
15466
15467 // Shuffle the (lowest) repeated elements in place for broadcast.
15468 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15469
15470 // Shuffle the actual broadcast.
15471 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15472 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15473 for (int j = 0; j != NumBroadcastElts; ++j)
15474 BroadcastMask[i + j] = j;
15475
15476 // Avoid returning the same shuffle operation. For example,
15477 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15478 if (BroadcastMask == Mask)
15479 return SDValue();
15480
15481 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15482 BroadcastMask);
15483 }
15484 }
15485
15486 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15487 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15488 return SDValue();
15489
15490 // Bail if we already have a repeated lane shuffle mask.
15491 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15492 return SDValue();
15493
15494 // Helper to look for repeated mask in each split sublane, and that those
15495 // sublanes can then be permuted into place.
15496 auto ShuffleSubLanes = [&](int SubLaneScale) {
15497 int NumSubLanes = NumLanes * SubLaneScale;
15498 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15499
15500 // Check that all the sources are coming from the same lane and see if we
15501 // can form a repeating shuffle mask (local to each sub-lane). At the same
15502 // time, determine the source sub-lane for each destination sub-lane.
15503 int TopSrcSubLane = -1;
15504 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15505 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15506 SubLaneScale,
15507 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15508
15509 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15510 // Extract the sub-lane mask, check that it all comes from the same lane
15511 // and normalize the mask entries to come from the first lane.
15512 int SrcLane = -1;
15513 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15514 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15515 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15516 if (M < 0)
15517 continue;
15518 int Lane = (M % NumElts) / NumLaneElts;
15519 if ((0 <= SrcLane) && (SrcLane != Lane))
15520 return SDValue();
15521 SrcLane = Lane;
15522 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15523 SubLaneMask[Elt] = LocalM;
15524 }
15525
15526 // Whole sub-lane is UNDEF.
15527 if (SrcLane < 0)
15528 continue;
15529
15530 // Attempt to match against the candidate repeated sub-lane masks.
15531 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15532 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15533 for (int i = 0; i != NumSubLaneElts; ++i) {
15534 if (M1[i] < 0 || M2[i] < 0)
15535 continue;
15536 if (M1[i] != M2[i])
15537 return false;
15538 }
15539 return true;
15540 };
15541
15542 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15543 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15544 continue;
15545
15546 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15547 for (int i = 0; i != NumSubLaneElts; ++i) {
15548 int M = SubLaneMask[i];
15549 if (M < 0)
15550 continue;
15551 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15552 "Unexpected mask element");
15553 RepeatedSubLaneMask[i] = M;
15554 }
15555
15556 // Track the top most source sub-lane - by setting the remaining to
15557 // UNDEF we can greatly simplify shuffle matching.
15558 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15559 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15560 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15561 break;
15562 }
15563
15564 // Bail if we failed to find a matching repeated sub-lane mask.
15565 if (Dst2SrcSubLanes[DstSubLane] < 0)
15566 return SDValue();
15567 }
15568 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15569 "Unexpected source lane");
15570
15571 // Create a repeating shuffle mask for the entire vector.
15572 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15573 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15574 int Lane = SubLane / SubLaneScale;
15575 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15576 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15577 int M = RepeatedSubLaneMask[Elt];
15578 if (M < 0)
15579 continue;
15580 int Idx = (SubLane * NumSubLaneElts) + Elt;
15581 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15582 }
15583 }
15584
15585 // Shuffle each source sub-lane to its destination.
15586 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15587 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15588 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15589 if (SrcSubLane < 0)
15590 continue;
15591 for (int j = 0; j != NumSubLaneElts; ++j)
15592 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15593 }
15594
15595 // Avoid returning the same shuffle operation.
15596 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15597 if (RepeatedMask == Mask || SubLaneMask == Mask)
15598 return SDValue();
15599
15600 SDValue RepeatedShuffle =
15601 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15602
15603 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15604 SubLaneMask);
15605 };
15606
15607 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15608 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15609 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15610 // Otherwise we can only permute whole 128-bit lanes.
15611 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15612 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15613 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15614 MinSubLaneScale = 2;
15615 MaxSubLaneScale =
15616 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15617 }
15618 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15619 MinSubLaneScale = MaxSubLaneScale = 4;
15620
15621 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15622 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15623 return Shuffle;
15624
15625 return SDValue();
15626}
15627
15628 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15629 bool &ForceV1Zero, bool &ForceV2Zero,
15630 unsigned &ShuffleImm, ArrayRef<int> Mask,
15631 const APInt &Zeroable) {
15632 int NumElts = VT.getVectorNumElements();
15633 assert(VT.getScalarSizeInBits() == 64 &&
15634 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15635 "Unexpected data type for VSHUFPD");
15636 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15637 "Illegal shuffle mask");
15638
15639 bool ZeroLane[2] = { true, true };
15640 for (int i = 0; i < NumElts; ++i)
15641 ZeroLane[i & 1] &= Zeroable[i];
15642
15643 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15644 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
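// Illustrative example: the v4f64 mask <0, 5, 2, 7> fits this pattern and
// yields ShuffleImm = (0 << 0) | (1 << 1) | (0 << 2) | (1 << 3) = 0xA.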
15645 ShuffleImm = 0;
15646 bool ShufpdMask = true;
15647 bool CommutableMask = true;
15648 for (int i = 0; i < NumElts; ++i) {
15649 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15650 continue;
15651 if (Mask[i] < 0)
15652 return false;
15653 int Val = (i & 6) + NumElts * (i & 1);
15654 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15655 if (Mask[i] < Val || Mask[i] > Val + 1)
15656 ShufpdMask = false;
15657 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15658 CommutableMask = false;
15659 ShuffleImm |= (Mask[i] % 2) << i;
15660 }
15661
15662 if (!ShufpdMask && !CommutableMask)
15663 return false;
15664
15665 if (!ShufpdMask && CommutableMask)
15666 std::swap(V1, V2);
15667
15668 ForceV1Zero = ZeroLane[0];
15669 ForceV2Zero = ZeroLane[1];
15670 return true;
15671}
15672
15672
15673 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15674 SDValue V2, ArrayRef<int> Mask,
15675 const APInt &Zeroable,
15676 const X86Subtarget &Subtarget,
15677 SelectionDAG &DAG) {
15678 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15679 "Unexpected data type for VSHUFPD");
15680
15681 unsigned Immediate = 0;
15682 bool ForceV1Zero = false, ForceV2Zero = false;
15683 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15684 Mask, Zeroable))
15685 return SDValue();
15686
15687 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15688 if (ForceV1Zero)
15689 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15690 if (ForceV2Zero)
15691 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15692
15693 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15694 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15695}
15696
15697// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
15698// by zeroable elements in the remaining 24 elements. Turn this into two
15699// vmovqb instructions shuffled together.
15700 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15701 SDValue V1, SDValue V2,
15702 ArrayRef<int> Mask,
15703 const APInt &Zeroable,
15704 SelectionDAG &DAG) {
15705 assert(VT == MVT::v32i8 && "Unexpected type!");
15706
15707 // The first 8 indices should be every 8th element.
15708 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15709 return SDValue();
15710
15711 // Remaining elements need to be zeroable.
15712 if (Zeroable.countl_one() < (Mask.size() - 8))
15713 return SDValue();
15714
15715 V1 = DAG.getBitcast(MVT::v4i64, V1);
15716 V2 = DAG.getBitcast(MVT::v4i64, V2);
15717
15718 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15719 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15720
15721 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15722 // the upper bits of the result using an unpckldq.
15723 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15724 { 0, 1, 2, 3, 16, 17, 18, 19,
15725 4, 5, 6, 7, 20, 21, 22, 23 });
15726 // Insert the unpckldq into a zero vector to widen to v32i8.
15727 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15728 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15729 DAG.getIntPtrConstant(0, DL));
15730}
15731
15732// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15733// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15734// =>
15735// ul = unpckl v1, v2
15736// uh = unpckh v1, v2
15737// a = vperm ul, uh
15738// b = vperm ul, uh
15739//
15740// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15741// and permute. We cannot directly match v3 because it is split into two
15742// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15743// pair of 256-bit shuffles and makes sure the masks are consecutive.
15744//
15745// Once unpck and permute nodes are created, the permute corresponding to this
15746// shuffle is returned, while the other permute replaces the other half of the
15747// shuffle in the selection dag.
15748 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15749 SDValue V1, SDValue V2,
15750 ArrayRef<int> Mask,
15751 SelectionDAG &DAG) {
15752 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15753 VT != MVT::v32i8)
15754 return SDValue();
15755 // <B0, B1, B0+1, B1+1, ..., >
15756 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15757 unsigned Begin1) {
15758 size_t Size = Mask.size();
15759 assert(Size % 2 == 0 && "Expected even mask size");
15760 for (unsigned I = 0; I < Size; I += 2) {
15761 if (Mask[I] != (int)(Begin0 + I / 2) ||
15762 Mask[I + 1] != (int)(Begin1 + I / 2))
15763 return false;
15764 }
15765 return true;
15766 };
15767 // Check which half of the interleave this shuffle node is.
15768 int NumElts = VT.getVectorNumElements();
15769 size_t FirstQtr = NumElts / 2;
15770 size_t ThirdQtr = NumElts + NumElts / 2;
15771 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15772 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15773 if (!IsFirstHalf && !IsSecondHalf)
15774 return SDValue();
15775
15776 // Find the intersection between shuffle users of V1 and V2.
15777 SmallVector<SDNode *, 2> Shuffles;
15778 for (SDNode *User : V1->uses())
15779 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15780 User->getOperand(1) == V2)
15781 Shuffles.push_back(User);
15782 // Limit user size to two for now.
15783 if (Shuffles.size() != 2)
15784 return SDValue();
15785   // Find out which half of the 512-bit shuffle each of the smaller shuffles is
15786 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15787 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15788 SDNode *FirstHalf;
15789 SDNode *SecondHalf;
15790 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15791 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15792 FirstHalf = Shuffles[0];
15793 SecondHalf = Shuffles[1];
15794 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15795 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15796 FirstHalf = Shuffles[1];
15797 SecondHalf = Shuffles[0];
15798 } else {
15799 return SDValue();
15800 }
15801 // Lower into unpck and perm. Return the perm of this shuffle and replace
15802 // the other.
15803 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15804 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15805 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15806 DAG.getTargetConstant(0x20, DL, MVT::i8));
15807 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15808 DAG.getTargetConstant(0x31, DL, MVT::i8));
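  // The immediates select 128-bit lanes: 0x20 takes the low lane of Unpckl and
  // the low lane of Unpckh, while 0x31 takes the two high lanes, so Perm1 and
  // Perm2 reassemble the low and high halves of the interleaved result.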
15809 if (IsFirstHalf) {
15810 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15811 return Perm1;
15812 }
15813 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15814 return Perm2;
15815}
15816
15817/// Handle lowering of 4-lane 64-bit floating point shuffles.
15818///
15819/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15820/// isn't available.
15821 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15822                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15823 const X86Subtarget &Subtarget,
15824 SelectionDAG &DAG) {
15825 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15826 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15827 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15828
15829 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15830 Subtarget, DAG))
15831 return V;
15832
15833 if (V2.isUndef()) {
15834 // Check for being able to broadcast a single element.
15835 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15836 Mask, Subtarget, DAG))
15837 return Broadcast;
15838
15839 // Use low duplicate instructions for masks that match their pattern.
15840 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15841 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15842
15843 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15844 // Non-half-crossing single input shuffles can be lowered with an
15845 // interleaved permutation.
15846 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15847 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
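      // e.g. the in-lane swap mask <1, 0, 3, 2> yields the immediate 0b0101.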
15848 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15849 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15850 }
15851
15852 // With AVX2 we have direct support for this permutation.
15853 if (Subtarget.hasAVX2())
15854 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15855 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15856
15857 // Try to create an in-lane repeating shuffle mask and then shuffle the
15858 // results into the target lanes.
15859     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15860             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15861 return V;
15862
15863 // Try to permute the lanes and then use a per-lane permute.
15864 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15865 Mask, DAG, Subtarget))
15866 return V;
15867
15868 // Otherwise, fall back.
15869 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15870 DAG, Subtarget);
15871 }
15872
15873 // Use dedicated unpack instructions for masks that match their pattern.
15874 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15875 return V;
15876
15877 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15878 Zeroable, Subtarget, DAG))
15879 return Blend;
15880
15881 // Check if the blend happens to exactly fit that of SHUFPD.
15882 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15883 Zeroable, Subtarget, DAG))
15884 return Op;
15885
15886 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15887 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15888
15889 // If we have lane crossing shuffles AND they don't all come from the lower
15890 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15891 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15892   // canonicalizes to a blend of splats, which isn't necessary for this combine.
15893 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15894 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15895 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15896 (V2.getOpcode() != ISD::BUILD_VECTOR))
15897 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15898
15899 // If we have one input in place, then we can permute the other input and
15900 // blend the result.
15901 if (V1IsInPlace || V2IsInPlace)
15902 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15903 Subtarget, DAG);
15904
15905 // Try to create an in-lane repeating shuffle mask and then shuffle the
15906 // results into the target lanes.
15907   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15908           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15909 return V;
15910
15911 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15912   // shuffle. However, if we have AVX2 and either input is already in place,
15913   // we will be able to shuffle the other input across lanes in a single
15914   // instruction, so skip this pattern.
15915 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15916     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15917             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15918 return V;
15919
15920 // If we have VLX support, we can use VEXPAND.
15921 if (Subtarget.hasVLX())
15922 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15923 DAG, Subtarget))
15924 return V;
15925
15926   // If we have AVX2 then we always want to lower with a blend because at v4 we
15927 // can fully permute the elements.
15928 if (Subtarget.hasAVX2())
15929 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15930 Subtarget, DAG);
15931
15932 // Otherwise fall back on generic lowering.
15933 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15934 Subtarget, DAG);
15935}
15936
15937/// Handle lowering of 4-lane 64-bit integer shuffles.
15938///
15939/// This routine is only called when we have AVX2 and thus a reasonable
15940/// instruction set for v4i64 shuffling..
15942 const APInt &Zeroable, SDValue V1, SDValue V2,
15943 const X86Subtarget &Subtarget,
15944 SelectionDAG &DAG) {
15945 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15946 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15947 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15948 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15949
15950 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15951 Subtarget, DAG))
15952 return V;
15953
15954 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15955 Zeroable, Subtarget, DAG))
15956 return Blend;
15957
15958 // Check for being able to broadcast a single element.
15959 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15960 Subtarget, DAG))
15961 return Broadcast;
15962
15963 // Try to use shift instructions if fast.
15964 if (Subtarget.preferLowerShuffleAsShift())
15965 if (SDValue Shift =
15966 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15967 Subtarget, DAG, /*BitwiseOnly*/ true))
15968 return Shift;
15969
15970 if (V2.isUndef()) {
15971 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15972 // can use lower latency instructions that will operate on both lanes.
15973 SmallVector<int, 2> RepeatedMask;
15974 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15975 SmallVector<int, 4> PSHUFDMask;
15976 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
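      // e.g. the repeated 64-bit mask <1, 0> becomes the 32-bit PSHUFD mask
      // <2, 3, 0, 1>, applied identically in both 128-bit lanes.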
15977 return DAG.getBitcast(
15978 MVT::v4i64,
15979 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15980 DAG.getBitcast(MVT::v8i32, V1),
15981 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15982 }
15983
15984 // AVX2 provides a direct instruction for permuting a single input across
15985 // lanes.
15986 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15987 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15988 }
15989
15990 // Try to use shift instructions.
15991 if (SDValue Shift =
15992 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15993 DAG, /*BitwiseOnly*/ false))
15994 return Shift;
15995
15996 // If we have VLX support, we can use VALIGN or VEXPAND.
15997 if (Subtarget.hasVLX()) {
15998 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15999 Zeroable, Subtarget, DAG))
16000 return Rotate;
16001
16002 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16003 DAG, Subtarget))
16004 return V;
16005 }
16006
16007 // Try to use PALIGNR.
16008 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16009 Subtarget, DAG))
16010 return Rotate;
16011
16012 // Use dedicated unpack instructions for masks that match their pattern.
16013 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16014 return V;
16015
16016 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16017 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16018
16019 // If we have one input in place, then we can permute the other input and
16020 // blend the result.
16021 if (V1IsInPlace || V2IsInPlace)
16022 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16023 Subtarget, DAG);
16024
16025 // Try to create an in-lane repeating shuffle mask and then shuffle the
16026 // results into the target lanes.
16027   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16028           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16029 return V;
16030
16031 // Try to lower to PERMQ(BLENDD(V1,V2)).
16032 if (SDValue V =
16033 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16034 return V;
16035
16036 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16037   // shuffle. However, if we have AVX2 and either input is already in place,
16038   // we will be able to shuffle the other input across lanes in a single
16039   // instruction, so skip this pattern.
16040 if (!V1IsInPlace && !V2IsInPlace)
16041     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16042             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16043 return Result;
16044
16045 // Otherwise fall back on generic blend lowering.
16046 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16047 Subtarget, DAG);
16048}
16049
16050/// Handle lowering of 8-lane 32-bit floating point shuffles.
16051///
16052/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16053/// isn't available.
16054 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16055                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16056 const X86Subtarget &Subtarget,
16057 SelectionDAG &DAG) {
16058 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16059 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16060 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16061
16062 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16063 Zeroable, Subtarget, DAG))
16064 return Blend;
16065
16066 // Check for being able to broadcast a single element.
16067 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16068 Subtarget, DAG))
16069 return Broadcast;
16070
16071 if (!Subtarget.hasAVX2()) {
16072 SmallVector<int> InLaneMask;
16073 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16074
16075 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16076 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16077 /*SimpleOnly*/ true))
16078 return R;
16079 }
16080 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16081 Zeroable, Subtarget, DAG))
16082 return DAG.getBitcast(MVT::v8f32, ZExt);
16083
16084 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16085 // options to efficiently lower the shuffle.
16086 SmallVector<int, 4> RepeatedMask;
16087 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16088 assert(RepeatedMask.size() == 4 &&
16089 "Repeated masks must be half the mask width!");
16090
16091 // Use even/odd duplicate instructions for masks that match their pattern.
16092 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16093 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16094 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16095 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16096
16097 if (V2.isUndef())
16098 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16099 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16100
16101 // Use dedicated unpack instructions for masks that match their pattern.
16102 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16103 return V;
16104
16105 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16106 // have already handled any direct blends.
16107 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16108 }
16109
16110 // Try to create an in-lane repeating shuffle mask and then shuffle the
16111 // results into the target lanes.
16112   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16113           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16114 return V;
16115
16116 // If we have a single input shuffle with different shuffle patterns in the
16117 // two 128-bit lanes use the variable mask to VPERMILPS.
16118 if (V2.isUndef()) {
16119 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16120 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16121 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16122 }
16123 if (Subtarget.hasAVX2()) {
16124 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16125 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16126 }
16127 // Otherwise, fall back.
16128 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16129 DAG, Subtarget);
16130 }
16131
16132 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16133 // shuffle.
16134   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16135           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16136 return Result;
16137
16138 // If we have VLX support, we can use VEXPAND.
16139 if (Subtarget.hasVLX())
16140 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16141 DAG, Subtarget))
16142 return V;
16143
16144 // Try to match an interleave of two v8f32s and lower them as unpck and
16145 // permutes using ymms. This needs to go before we try to split the vectors.
16146 //
16147   // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
16148 // this path inadvertently.
16149 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16150 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16151 Mask, DAG))
16152 return V;
16153
16154   // For non-AVX512, if the mask is of 16-bit elements in lane then try to
16155   // split, since after the split we get more efficient code using vpunpcklwd
16156   // and vpunpckhwd instructions than with vblend.
16157 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16158 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16159 DAG);
16160
16161 // If we have AVX2 then we always want to lower with a blend because at v8 we
16162 // can fully permute the elements.
16163 if (Subtarget.hasAVX2())
16164 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16165 Subtarget, DAG);
16166
16167 // Otherwise fall back on generic lowering.
16168 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16169 Subtarget, DAG);
16170}
16171
16172/// Handle lowering of 8-lane 32-bit integer shuffles.
16173///
16174/// This routine is only called when we have AVX2 and thus a reasonable
16175/// instruction set for v8i32 shuffling..
16177 const APInt &Zeroable, SDValue V1, SDValue V2,
16178 const X86Subtarget &Subtarget,
16179 SelectionDAG &DAG) {
16180 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16181 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16182 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16183 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16184
16185 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16186
16187 // Whenever we can lower this as a zext, that instruction is strictly faster
16188 // than any alternative. It also allows us to fold memory operands into the
16189 // shuffle in many cases.
16190 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16191 Zeroable, Subtarget, DAG))
16192 return ZExt;
16193
16194 // Try to match an interleave of two v8i32s and lower them as unpck and
16195 // permutes using ymms. This needs to go before we try to split the vectors.
16196 if (!Subtarget.hasAVX512())
16197 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16198 Mask, DAG))
16199 return V;
16200
16201   // For non-AVX512, if the mask is of 16-bit elements in lane then try to
16202   // split, since after the split we get more efficient code than with vblend
16203   // by using vpunpcklwd and vpunpckhwd instructions.
16204 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16205 !Subtarget.hasAVX512())
16206 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16207 DAG);
16208
16209 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16210 Zeroable, Subtarget, DAG))
16211 return Blend;
16212
16213 // Check for being able to broadcast a single element.
16214 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16215 Subtarget, DAG))
16216 return Broadcast;
16217
16218 // Try to use shift instructions if fast.
16219 if (Subtarget.preferLowerShuffleAsShift()) {
16220 if (SDValue Shift =
16221 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16222 Subtarget, DAG, /*BitwiseOnly*/ true))
16223 return Shift;
16224 if (NumV2Elements == 0)
16225 if (SDValue Rotate =
16226 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16227 return Rotate;
16228 }
16229
16230 // If the shuffle mask is repeated in each 128-bit lane we can use more
16231 // efficient instructions that mirror the shuffles across the two 128-bit
16232 // lanes.
16233 SmallVector<int, 4> RepeatedMask;
16234 bool Is128BitLaneRepeatedShuffle =
16235 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16236 if (Is128BitLaneRepeatedShuffle) {
16237 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16238 if (V2.isUndef())
16239 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16240 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16241
16242 // Use dedicated unpack instructions for masks that match their pattern.
16243 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16244 return V;
16245 }
16246
16247 // Try to use shift instructions.
16248 if (SDValue Shift =
16249 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16250 DAG, /*BitwiseOnly*/ false))
16251 return Shift;
16252
16253 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16254 if (SDValue Rotate =
16255 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16256 return Rotate;
16257
16258 // If we have VLX support, we can use VALIGN or EXPAND.
16259 if (Subtarget.hasVLX()) {
16260 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16261 Zeroable, Subtarget, DAG))
16262 return Rotate;
16263
16264 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16265 DAG, Subtarget))
16266 return V;
16267 }
16268
16269 // Try to use byte rotation instructions.
16270 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16271 Subtarget, DAG))
16272 return Rotate;
16273
16274 // Try to create an in-lane repeating shuffle mask and then shuffle the
16275 // results into the target lanes.
16276   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16277           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16278 return V;
16279
16280 if (V2.isUndef()) {
16281 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16282 // because that should be faster than the variable permute alternatives.
16283 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16284 return V;
16285
16286 // If the shuffle patterns aren't repeated but it's a single input, directly
16287 // generate a cross-lane VPERMD instruction.
16288 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16289 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16290 }
16291
16292 // Assume that a single SHUFPS is faster than an alternative sequence of
16293 // multiple instructions (even if the CPU has a domain penalty).
16294 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16295 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16296 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16297 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16298 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16299 CastV1, CastV2, DAG);
16300 return DAG.getBitcast(MVT::v8i32, ShufPS);
16301 }
16302
16303 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16304 // shuffle.
16305   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16306           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16307 return Result;
16308
16309 // Otherwise fall back on generic blend lowering.
16310 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16311 Subtarget, DAG);
16312}
16313
16314/// Handle lowering of 16-lane 16-bit integer shuffles.
16315///
16316/// This routine is only called when we have AVX2 and thus a reasonable
16317/// instruction set for v16i16 shuffling..
16319 const APInt &Zeroable, SDValue V1, SDValue V2,
16320 const X86Subtarget &Subtarget,
16321 SelectionDAG &DAG) {
16322 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16323 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16324 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16325 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16326
16327 // Whenever we can lower this as a zext, that instruction is strictly faster
16328 // than any alternative. It also allows us to fold memory operands into the
16329 // shuffle in many cases.
16330   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16331           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16332 return ZExt;
16333
16334 // Check for being able to broadcast a single element.
16335 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16336 Subtarget, DAG))
16337 return Broadcast;
16338
16339 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16340 Zeroable, Subtarget, DAG))
16341 return Blend;
16342
16343 // Use dedicated unpack instructions for masks that match their pattern.
16344 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16345 return V;
16346
16347 // Use dedicated pack instructions for masks that match their pattern.
16348 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16349 Subtarget))
16350 return V;
16351
16352   // Try to lower using a truncation.
16353 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16354 Subtarget, DAG))
16355 return V;
16356
16357 // Try to use shift instructions.
16358 if (SDValue Shift =
16359 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16360 Subtarget, DAG, /*BitwiseOnly*/ false))
16361 return Shift;
16362
16363 // Try to use byte rotation instructions.
16364 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16365 Subtarget, DAG))
16366 return Rotate;
16367
16368 // Try to create an in-lane repeating shuffle mask and then shuffle the
16369 // results into the target lanes.
16370   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16371           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16372 return V;
16373
16374 if (V2.isUndef()) {
16375 // Try to use bit rotation instructions.
16376 if (SDValue Rotate =
16377 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16378 return Rotate;
16379
16380 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16381 // because that should be faster than the variable permute alternatives.
16382 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16383 return V;
16384
16385 // There are no generalized cross-lane shuffle operations available on i16
16386 // element types.
16387 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16388       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16389               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16390 return V;
16391
16392 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16393 DAG, Subtarget);
16394 }
16395
16396 SmallVector<int, 8> RepeatedMask;
16397 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16398 // As this is a single-input shuffle, the repeated mask should be
16399 // a strictly valid v8i16 mask that we can pass through to the v8i16
16400 // lowering to handle even the v16 case.
16401       return lowerV8I16GeneralSingleInputShuffle(
16402           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16403 }
16404 }
16405
16406 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16407 Zeroable, Subtarget, DAG))
16408 return PSHUFB;
16409
16410 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16411 if (Subtarget.hasBWI())
16412 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16413
16414 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16415 // shuffle.
16416   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16417           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16418 return Result;
16419
16420 // Try to permute the lanes and then use a per-lane permute.
16421   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16422           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16423 return V;
16424
16425 // Try to match an interleave of two v16i16s and lower them as unpck and
16426 // permutes using ymms.
16427 if (!Subtarget.hasAVX512())
16428 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16429 Mask, DAG))
16430 return V;
16431
16432 // Otherwise fall back on generic lowering.
16433 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16434 Subtarget, DAG);
16435}
16436
16437/// Handle lowering of 32-lane 8-bit integer shuffles.
16438///
16439/// This routine is only called when we have AVX2 and thus a reasonable
16440/// instruction set for v32i8 shuffling..
16442 const APInt &Zeroable, SDValue V1, SDValue V2,
16443 const X86Subtarget &Subtarget,
16444 SelectionDAG &DAG) {
16445 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16446 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16447 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16448 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16449
16450 // Whenever we can lower this as a zext, that instruction is strictly faster
16451 // than any alternative. It also allows us to fold memory operands into the
16452 // shuffle in many cases.
16453 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16454 Zeroable, Subtarget, DAG))
16455 return ZExt;
16456
16457 // Check for being able to broadcast a single element.
16458 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16459 Subtarget, DAG))
16460 return Broadcast;
16461
16462 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16463 Zeroable, Subtarget, DAG))
16464 return Blend;
16465
16466 // Use dedicated unpack instructions for masks that match their pattern.
16467 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16468 return V;
16469
16470 // Use dedicated pack instructions for masks that match their pattern.
16471 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16472 Subtarget))
16473 return V;
16474
16475   // Try to lower using a truncation.
16476 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16477 Subtarget, DAG))
16478 return V;
16479
16480 // Try to use shift instructions.
16481 if (SDValue Shift =
16482 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16483 DAG, /*BitwiseOnly*/ false))
16484 return Shift;
16485
16486 // Try to use byte rotation instructions.
16487 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16488 Subtarget, DAG))
16489 return Rotate;
16490
16491 // Try to use bit rotation instructions.
16492 if (V2.isUndef())
16493 if (SDValue Rotate =
16494 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16495 return Rotate;
16496
16497 // Try to create an in-lane repeating shuffle mask and then shuffle the
16498 // results into the target lanes.
16499   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16500           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16501 return V;
16502
16503 // There are no generalized cross-lane shuffle operations available on i8
16504 // element types.
16505 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16506 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16507 // because that should be faster than the variable permute alternatives.
16508 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16509 return V;
16510
16511     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16512             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16513 return V;
16514
16515 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16516 DAG, Subtarget);
16517 }
16518
16519 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16520 Zeroable, Subtarget, DAG))
16521 return PSHUFB;
16522
16523 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16524 if (Subtarget.hasVBMI())
16525 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16526
16527 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16528 // shuffle.
16529   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16530           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16531 return Result;
16532
16533 // Try to permute the lanes and then use a per-lane permute.
16534   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16535           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16536 return V;
16537
16538   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16539   // by zeroable elements in the remaining 24 elements. Turn this into two
16540   // vpmovqb instructions shuffled together.
16541 if (Subtarget.hasVLX())
16542 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16543 Mask, Zeroable, DAG))
16544 return V;
16545
16546 // Try to match an interleave of two v32i8s and lower them as unpck and
16547 // permutes using ymms.
16548 if (!Subtarget.hasAVX512())
16549 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16550 Mask, DAG))
16551 return V;
16552
16553 // Otherwise fall back on generic lowering.
16554 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16555 Subtarget, DAG);
16556}
16557
16558/// High-level routine to lower various 256-bit x86 vector shuffles.
16559///
16560/// This routine either breaks down the specific type of a 256-bit x86 vector
16561/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16562/// together based on the available instructions.
16563 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16564                                   SDValue V1, SDValue V2, const APInt &Zeroable,
16565 const X86Subtarget &Subtarget,
16566 SelectionDAG &DAG) {
16567 // If we have a single input to the zero element, insert that into V1 if we
16568 // can do so cheaply.
16569 int NumElts = VT.getVectorNumElements();
16570 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16571
16572 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16573     if (SDValue Insertion = lowerShuffleAsElementInsertion(
16574             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16575 return Insertion;
16576
16577 // Handle special cases where the lower or upper half is UNDEF.
16578 if (SDValue V =
16579 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16580 return V;
16581
16582 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16583 // can check for those subtargets here and avoid much of the subtarget
16584 // querying in the per-vector-type lowering routines. With AVX1 we have
16585 // essentially *zero* ability to manipulate a 256-bit vector with integer
16586 // types. Since we'll use floating point types there eventually, just
16587 // immediately cast everything to a float and operate entirely in that domain.
16588 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16589 int ElementBits = VT.getScalarSizeInBits();
16590 if (ElementBits < 32) {
16591       // No floating point type is available; if we can't use the bit operations
16592 // for masking/blending then decompose into 128-bit vectors.
16593 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16594 Subtarget, DAG))
16595 return V;
16596 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16597 return V;
16598 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16599 }
16600
16601 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16602                                  VT.getVectorNumElements());
16603   V1 = DAG.getBitcast(FpVT, V1);
16604 V2 = DAG.getBitcast(FpVT, V2);
16605 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16606 }
16607
16608 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16609 V1 = DAG.getBitcast(MVT::v16i16, V1);
16610 V2 = DAG.getBitcast(MVT::v16i16, V2);
16611 return DAG.getBitcast(VT,
16612 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16613 }
16614
16615 switch (VT.SimpleTy) {
16616 case MVT::v4f64:
16617 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16618 case MVT::v4i64:
16619 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16620 case MVT::v8f32:
16621 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16622 case MVT::v8i32:
16623 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16624 case MVT::v16i16:
16625 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16626 case MVT::v32i8:
16627 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16628
16629 default:
16630 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16631 }
16632}
16633
16634/// Try to lower a vector shuffle as a 128-bit shuffles.
16635 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16636                                   const APInt &Zeroable, SDValue V1, SDValue V2,
16637 const X86Subtarget &Subtarget,
16638 SelectionDAG &DAG) {
16639 assert(VT.getScalarSizeInBits() == 64 &&
16640 "Unexpected element type size for 128bit shuffle.");
16641
16642   // Handling a 256-bit vector requires VLX, and most probably the function
16643   // lowerV2X128Shuffle() is a better solution for that case.
16644 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16645
16646   // TODO - use Zeroable like we do for lowerV2X128Shuffle?
16647 SmallVector<int, 4> Widened128Mask;
16648 if (!canWidenShuffleElements(Mask, Widened128Mask))
16649 return SDValue();
16650 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16651
16652 // Try to use an insert into a zero vector.
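  // (Zeroable has one bit per 64-bit element here, so 0xf0 means the upper
  // four elements are zeroable and 0x0c means elements 2-3 are.)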
16653 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16654 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16655 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16656 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16657 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16658 DAG.getIntPtrConstant(0, DL));
16659 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16660 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16661 DAG.getIntPtrConstant(0, DL));
16662 }
16663
16664 // Check for patterns which can be matched with a single insert of a 256-bit
16665 // subvector.
16666 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16667 if (OnlyUsesV1 ||
16668 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16669 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16670 SDValue SubVec =
16671 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16672 DAG.getIntPtrConstant(0, DL));
16673 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16674 DAG.getIntPtrConstant(4, DL));
16675 }
16676
16677 // See if this is an insertion of the lower 128-bits of V2 into V1.
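  // e.g. the widened mask <0, 1, 4, 3> is such an insertion: V2's low 128 bits
  // replace lane 2 of V1.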
16678 bool IsInsert = true;
16679 int V2Index = -1;
16680 for (int i = 0; i < 4; ++i) {
16681 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16682 if (Widened128Mask[i] < 0)
16683 continue;
16684
16685 // Make sure all V1 subvectors are in place.
16686 if (Widened128Mask[i] < 4) {
16687 if (Widened128Mask[i] != i) {
16688 IsInsert = false;
16689 break;
16690 }
16691 } else {
16692       // Make sure we only have a single V2 index and it's the lowest 128 bits.
16693 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16694 IsInsert = false;
16695 break;
16696 }
16697 V2Index = i;
16698 }
16699 }
16700 if (IsInsert && V2Index >= 0) {
16701 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16702 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16703 DAG.getIntPtrConstant(0, DL));
16704 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16705 }
16706
16707   // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
16708   // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16709 // possible we at least ensure the lanes stay sequential to help later
16710 // combines.
16711 SmallVector<int, 2> Widened256Mask;
16712 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16713 Widened128Mask.clear();
16714 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16715 }
16716
16717 // Try to lower to vshuf64x2/vshuf32x4.
16718 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16719 int PermMask[4] = {-1, -1, -1, -1};
16720 // Ensure elements came from the same Op.
16721 for (int i = 0; i < 4; ++i) {
16722 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16723 if (Widened128Mask[i] < 0)
16724 continue;
16725
16726 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16727 unsigned OpIndex = i / 2;
16728 if (Ops[OpIndex].isUndef())
16729 Ops[OpIndex] = Op;
16730 else if (Ops[OpIndex] != Op)
16731 return SDValue();
16732
16733 PermMask[i] = Widened128Mask[i] % 4;
16734 }
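  // e.g. the widened mask <0, 1, 4, 5> yields Ops = {V1, V2} and
  // PermMask <0, 1, 0, 1>, i.e. the low 256 bits of V1 followed by the low
  // 256 bits of V2.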
16735
16736 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16737 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16738}
16739
16740/// Handle lowering of 8-lane 64-bit floating point shuffles.
16741 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16742                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16743 const X86Subtarget &Subtarget,
16744 SelectionDAG &DAG) {
16745 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16746 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16747 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16748
16749 if (V2.isUndef()) {
16750 // Use low duplicate instructions for masks that match their pattern.
16751 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16752 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16753
16754 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16755 // Non-half-crossing single input shuffles can be lowered with an
16756 // interleaved permutation.
16757 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16758 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16759 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16760 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
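      // e.g. the lane-wise swap mask <1, 0, 3, 2, 5, 4, 7, 6> yields the
      // immediate 0x55.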
16761 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16762 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16763 }
16764
16765 SmallVector<int, 4> RepeatedMask;
16766 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16767 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16768 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16769 }
16770
16771 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16772 V2, Subtarget, DAG))
16773 return Shuf128;
16774
16775 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16776 return Unpck;
16777
16778 // Check if the blend happens to exactly fit that of SHUFPD.
16779 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16780 Zeroable, Subtarget, DAG))
16781 return Op;
16782
16783 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16784 DAG, Subtarget))
16785 return V;
16786
16787 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16788 Zeroable, Subtarget, DAG))
16789 return Blend;
16790
16791 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16792}
16793
16794/// Handle lowering of 16-lane 32-bit floating point shuffles.
16795 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16796                                   const APInt &Zeroable, SDValue V1, SDValue V2,
16797 const X86Subtarget &Subtarget,
16798 SelectionDAG &DAG) {
16799 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16800 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16801 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16802
16803 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16804 // options to efficiently lower the shuffle.
16805 SmallVector<int, 4> RepeatedMask;
16806 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16807 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16808
16809 // Use even/odd duplicate instructions for masks that match their pattern.
16810 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16811 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16812 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16813 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16814
16815 if (V2.isUndef())
16816 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16817 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16818
16819 // Use dedicated unpack instructions for masks that match their pattern.
16820 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16821 return V;
16822
16823 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16824 Zeroable, Subtarget, DAG))
16825 return Blend;
16826
16827 // Otherwise, fall back to a SHUFPS sequence.
16828 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16829 }
16830
16831 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16832 Zeroable, Subtarget, DAG))
16833 return Blend;
16834
16835   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16836           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16837 return DAG.getBitcast(MVT::v16f32, ZExt);
16838
16839 // Try to create an in-lane repeating shuffle mask and then shuffle the
16840 // results into the target lanes.
16841   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16842           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16843 return V;
16844
16845 // If we have a single input shuffle with different shuffle patterns in the
16846 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16847 if (V2.isUndef() &&
16848 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16849 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16850 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16851 }
16852
16853 // If we have AVX512F support, we can use VEXPAND.
16854 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16855 V1, V2, DAG, Subtarget))
16856 return V;
16857
16858 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16859}
16860
16861/// Handle lowering of 8-lane 64-bit integer shuffles.
16862 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16863                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16864 const X86Subtarget &Subtarget,
16865 SelectionDAG &DAG) {
16866 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16867 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16868 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16869
16870 // Try to use shift instructions if fast.
16871 if (Subtarget.preferLowerShuffleAsShift())
16872 if (SDValue Shift =
16873 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16874 Subtarget, DAG, /*BitwiseOnly*/ true))
16875 return Shift;
16876
16877 if (V2.isUndef()) {
16878 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16879 // can use lower latency instructions that will operate on all four
16880 // 128-bit lanes.
16881 SmallVector<int, 2> Repeated128Mask;
16882 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16883 SmallVector<int, 4> PSHUFDMask;
16884 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16885 return DAG.getBitcast(
16886 MVT::v8i64,
16887 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16888 DAG.getBitcast(MVT::v16i32, V1),
16889 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16890 }
16891
16892 SmallVector<int, 4> Repeated256Mask;
16893 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16894 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16895 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16896 }
16897
16898 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16899 V2, Subtarget, DAG))
16900 return Shuf128;
16901
16902 // Try to use shift instructions.
16903 if (SDValue Shift =
16904 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16905 DAG, /*BitwiseOnly*/ false))
16906 return Shift;
16907
16908 // Try to use VALIGN.
16909 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16910 Zeroable, Subtarget, DAG))
16911 return Rotate;
16912
16913 // Try to use PALIGNR.
16914 if (Subtarget.hasBWI())
16915 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16916 Subtarget, DAG))
16917 return Rotate;
16918
16919 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16920 return Unpck;
16921
16922 // If we have AVX512F support, we can use VEXPAND.
16923 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16924 DAG, Subtarget))
16925 return V;
16926
16927 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16928 Zeroable, Subtarget, DAG))
16929 return Blend;
16930
16931 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16932}
16933
16934/// Handle lowering of 16-lane 32-bit integer shuffles.
16935 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16936                                   const APInt &Zeroable, SDValue V1, SDValue V2,
16937 const X86Subtarget &Subtarget,
16938 SelectionDAG &DAG) {
16939 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16940 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16941 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16942
16943 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16944
16945 // Whenever we can lower this as a zext, that instruction is strictly faster
16946 // than any alternative. It also allows us to fold memory operands into the
16947 // shuffle in many cases.
16948   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16949           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16950 return ZExt;
16951
16952 // Try to use shift instructions if fast.
16953 if (Subtarget.preferLowerShuffleAsShift()) {
16954 if (SDValue Shift =
16955 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16956 Subtarget, DAG, /*BitwiseOnly*/ true))
16957 return Shift;
16958 if (NumV2Elements == 0)
16959 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16960 Subtarget, DAG))
16961 return Rotate;
16962 }
16963
16964 // If the shuffle mask is repeated in each 128-bit lane we can use more
16965 // efficient instructions that mirror the shuffles across the four 128-bit
16966 // lanes.
16967 SmallVector<int, 4> RepeatedMask;
16968 bool Is128BitLaneRepeatedShuffle =
16969 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16970 if (Is128BitLaneRepeatedShuffle) {
16971 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16972 if (V2.isUndef())
16973 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16974 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16975
16976 // Use dedicated unpack instructions for masks that match their pattern.
16977 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16978 return V;
16979 }
16980
16981 // Try to use shift instructions.
16982 if (SDValue Shift =
16983 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16984 Subtarget, DAG, /*BitwiseOnly*/ false))
16985 return Shift;
16986
16987 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16988 if (SDValue Rotate =
16989 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16990 return Rotate;
16991
16992 // Try to use VALIGN.
16993 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16994 Zeroable, Subtarget, DAG))
16995 return Rotate;
16996
16997 // Try to use byte rotation instructions.
16998 if (Subtarget.hasBWI())
16999 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17000 Subtarget, DAG))
17001 return Rotate;
17002
17003 // Assume that a single SHUFPS is faster than using a permv shuffle.
17004 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17005 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17006 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17007 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17008 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17009 CastV1, CastV2, DAG);
17010 return DAG.getBitcast(MVT::v16i32, ShufPS);
17011 }
17012
17013 // Try to create an in-lane repeating shuffle mask and then shuffle the
17014 // results into the target lanes.
17015   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17016           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17017 return V;
17018
17019 // If we have AVX512F support, we can use VEXPAND.
17020 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17021 DAG, Subtarget))
17022 return V;
17023
17024 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17025 Zeroable, Subtarget, DAG))
17026 return Blend;
17027
17028 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17029}
17030
17031/// Handle lowering of 32-lane 16-bit integer shuffles.
17032 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17033                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17034 const X86Subtarget &Subtarget,
17035 SelectionDAG &DAG) {
17036 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17037 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17038 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17039 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17040
17041 // Whenever we can lower this as a zext, that instruction is strictly faster
17042 // than any alternative. It also allows us to fold memory operands into the
17043 // shuffle in many cases.
17044   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17045           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17046 return ZExt;
17047
17048 // Use dedicated unpack instructions for masks that match their pattern.
17049 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17050 return V;
17051
17052 // Use dedicated pack instructions for masks that match their pattern.
17053 if (SDValue V =
17054 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17055 return V;
17056
17057 // Try to use shift instructions.
17058 if (SDValue Shift =
17059 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17060 Subtarget, DAG, /*BitwiseOnly*/ false))
17061 return Shift;
17062
17063 // Try to use byte rotation instructions.
17064 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17065 Subtarget, DAG))
17066 return Rotate;
17067
17068 if (V2.isUndef()) {
17069 // Try to use bit rotation instructions.
17070 if (SDValue Rotate =
17071 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17072 return Rotate;
17073
17074 SmallVector<int, 8> RepeatedMask;
17075 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17076 // As this is a single-input shuffle, the repeated mask should be
17077 // a strictly valid v8i16 mask that we can pass through to the v8i16
17078 // lowering to handle even the v32 case.
17079 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17080 RepeatedMask, Subtarget, DAG);
17081 }
17082 }
17083
17084 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17085 Zeroable, Subtarget, DAG))
17086 return Blend;
17087
17088 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17089 Zeroable, Subtarget, DAG))
17090 return PSHUFB;
17091
17092 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17093}
17094
17095/// Handle lowering of 64-lane 8-bit integer shuffles.
17096 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17097                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17098 const X86Subtarget &Subtarget,
17099 SelectionDAG &DAG) {
17100 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17101 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17102 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17103 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17104
17105 // Whenever we can lower this as a zext, that instruction is strictly faster
17106 // than any alternative. It also allows us to fold memory operands into the
17107 // shuffle in many cases.
17108   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17109           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17110 return ZExt;
17111
17112 // Use dedicated unpack instructions for masks that match their pattern.
17113 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17114 return V;
17115
17116 // Use dedicated pack instructions for masks that match their pattern.
17117 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17118 Subtarget))
17119 return V;
17120
17121 // Try to use shift instructions.
17122 if (SDValue Shift =
17123 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17124 DAG, /*BitwiseOnly*/ false))
17125 return Shift;
17126
17127 // Try to use byte rotation instructions.
17128 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17129 Subtarget, DAG))
17130 return Rotate;
17131
17132 // Try to use bit rotation instructions.
17133 if (V2.isUndef())
17134 if (SDValue Rotate =
17135 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17136 return Rotate;
17137
17138 // Lower as AND if possible.
17139 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17140 Zeroable, Subtarget, DAG))
17141 return Masked;
17142
17143 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17144 Zeroable, Subtarget, DAG))
17145 return PSHUFB;
17146
17147 // Try to create an in-lane repeating shuffle mask and then shuffle the
17148 // results into the target lanes.
17149   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17150           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17151 return V;
17152
17153   if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17154           DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17155 return Result;
17156
17157 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17158 Zeroable, Subtarget, DAG))
17159 return Blend;
17160
17161 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17162 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17163 // PALIGNR will be cheaper than the second PSHUFB+OR.
17164 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17165 Mask, Subtarget, DAG))
17166 return V;
17167
17168 // If we can't directly blend but can use PSHUFB, that will be better as it
17169 // can both shuffle and set up the inefficient blend.
17170 bool V1InUse, V2InUse;
17171 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17172 DAG, V1InUse, V2InUse);
17173 }
17174
17175 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17176 // shuffle.
17177 if (!V2.isUndef())
17178     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17179             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17180 return Result;
17181
17182 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17183 if (Subtarget.hasVBMI())
17184 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17185
17186 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17187}
17188
17189/// High-level routine to lower various 512-bit x86 vector shuffles.
17190///
17191/// This routine either breaks down the specific type of a 512-bit x86 vector
17192/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17193/// together based on the available instructions.
17194static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17195 MVT VT, SDValue V1, SDValue V2,
17196 const APInt &Zeroable,
17197 const X86Subtarget &Subtarget,
17198 SelectionDAG &DAG) {
17199 assert(Subtarget.hasAVX512() &&
17200 "Cannot lower 512-bit vectors w/ basic ISA!");
17201
17202 // If we have a single input to the zero element, insert that into V1 if we
17203 // can do so cheaply.
17204 int NumElts = Mask.size();
17205 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17206
17207 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17208 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17209 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17210 return Insertion;
17211
17212 // Handle special cases where the lower or upper half is UNDEF.
17213 if (SDValue V =
17214 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17215 return V;
17216
17217 // Check for being able to broadcast a single element.
17218 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17219 Subtarget, DAG))
17220 return Broadcast;
17221
17222 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17223 // Try using bit ops for masking and blending before falling back to
17224 // splitting.
17225 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17226 Subtarget, DAG))
17227 return V;
17228 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17229 return V;
17230
17231 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17232 }
17233
17234 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17235 if (!Subtarget.hasBWI())
17236 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17237 /*SimpleOnly*/ false);
17238
17239 V1 = DAG.getBitcast(MVT::v32i16, V1);
17240 V2 = DAG.getBitcast(MVT::v32i16, V2);
17241 return DAG.getBitcast(VT,
17242 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17243 }
17244
17245 // Dispatch to each element type for lowering. If we don't have support for
17246 // specific element type shuffles at 512 bits, immediately split them and
17247 // lower them. Each lowering routine of a given type is allowed to assume that
17248 // the requisite ISA extensions for that element type are available.
17249 switch (VT.SimpleTy) {
17250 case MVT::v8f64:
17251 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17252 case MVT::v16f32:
17253 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17254 case MVT::v8i64:
17255 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v16i32:
17257 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v32i16:
17259 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260 case MVT::v64i8:
17261 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17262
17263 default:
17264 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17265 }
17266}
17267
17268static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17269 MVT VT, SDValue V1, SDValue V2,
17270 const X86Subtarget &Subtarget,
17271 SelectionDAG &DAG) {
17272 // Shuffle should be unary.
17273 if (!V2.isUndef())
17274 return SDValue();
17275
17276 int ShiftAmt = -1;
17277 int NumElts = Mask.size();
17278 for (int i = 0; i != NumElts; ++i) {
17279 int M = Mask[i];
17280 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17281 "Unexpected mask index.");
17282 if (M < 0)
17283 continue;
17284
17285 // The first non-undef element determines our shift amount.
17286 if (ShiftAmt < 0) {
17287 ShiftAmt = M - i;
17288 // Need to be shifting right.
17289 if (ShiftAmt <= 0)
17290 return SDValue();
17291 }
17292 // All non-undef elements must shift by the same amount.
17293 if (ShiftAmt != M - i)
17294 return SDValue();
17295 }
17296 assert(ShiftAmt >= 0 && "All undef?");
17297
17298 // Great we found a shift right.
17299 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17300 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17301 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17302 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17303 DAG.getIntPtrConstant(0, DL));
17304}
17305
17306// Determine if this shuffle can be implemented with a KSHIFT instruction.
17307// Returns the shift amount if possible or -1 if not. This is a simplified
17308// version of matchShuffleAsShift.
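// Illustrative example (assuming MaskOffset == 0): for a v8i1 shuffle whose
// mask is <Z,0,1,2,3,4,5,6> with element 0 zeroable, this matches a left
// shift by one, sets Opcode to X86ISD::KSHIFTL, and returns 1.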
17309static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17310 int MaskOffset, const APInt &Zeroable) {
17311 int Size = Mask.size();
17312
17313 auto CheckZeros = [&](int Shift, bool Left) {
17314 for (int j = 0; j < Shift; ++j)
17315 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17316 return false;
17317
17318 return true;
17319 };
17320
17321 auto MatchShift = [&](int Shift, bool Left) {
17322 unsigned Pos = Left ? Shift : 0;
17323 unsigned Low = Left ? 0 : Shift;
17324 unsigned Len = Size - Shift;
17325 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17326 };
17327
17328 for (int Shift = 1; Shift != Size; ++Shift)
17329 for (bool Left : {true, false})
17330 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17331 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17332 return Shift;
17333 }
17334
17335 return -1;
17336}
17337
17338
17339// Lower vXi1 vector shuffles.
17340// There is no dedicated instruction on AVX-512 that shuffles the masks.
17341// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17342// vector, shuffle it, and then truncate it back.
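// As a rough sketch: a unary v16i1 shuffle that cannot be matched to a
// KSHIFT or subvector insertion is sign-extended to v16i32 (or v16i16 when
// avoiding 512-bit ops), shuffled at that width, and converted back to a
// mask via a signed compare against zero or a truncate, depending on DQI/BWI.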
17343static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17344 MVT VT, SDValue V1, SDValue V2,
17345 const APInt &Zeroable,
17346 const X86Subtarget &Subtarget,
17347 SelectionDAG &DAG) {
17348 assert(Subtarget.hasAVX512() &&
17349 "Cannot lower 512-bit vectors w/o basic ISA!");
17350
17351 int NumElts = Mask.size();
17352 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17353
17354 // Try to recognize shuffles that are just padding a subvector with zeros.
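 // For example (illustrative), a v16i1 mask <0,1,2,3,Z,Z,...,Z> whose upper
 // elements are all zeroable extracts the low v4i1 subvector of V1 and
 // inserts it into an all-zeros v16i1 vector.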
17355 int SubvecElts = 0;
17356 int Src = -1;
17357 for (int i = 0; i != NumElts; ++i) {
17358 if (Mask[i] >= 0) {
17359 // Grab the source from the first valid mask. All subsequent elements need
17360 // to use this same source.
17361 if (Src < 0)
17362 Src = Mask[i] / NumElts;
17363 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17364 break;
17365 }
17366
17367 ++SubvecElts;
17368 }
17369 assert(SubvecElts != NumElts && "Identity shuffle?");
17370
17371 // Clip to a power of 2.
17372 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17373
17374 // Make sure the number of zeroable bits at the top at least covers the bits
17375 // not covered by the subvector.
17376 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17377 assert(Src >= 0 && "Expected a source!");
17378 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17379 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17380 Src == 0 ? V1 : V2,
17381 DAG.getIntPtrConstant(0, DL));
17382 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17383 DAG.getConstant(0, DL, VT),
17384 Extract, DAG.getIntPtrConstant(0, DL));
17385 }
17386
17387 // Try a simple shift right with undef elements. Later we'll try with zeros.
17388 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17389 DAG))
17390 return Shift;
17391
17392 // Try to match KSHIFTs.
17393 unsigned Offset = 0;
17394 for (SDValue V : { V1, V2 }) {
17395 unsigned Opcode;
17396 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17397 if (ShiftAmt >= 0) {
17398 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17399 MVT WideVT = Res.getSimpleValueType();
17400 // Widened right shifts need two shifts to ensure we shift in zeroes.
17401 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17402 int WideElts = WideVT.getVectorNumElements();
17403 // Shift left to put the original vector in the MSBs of the new size.
17404 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17405 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17406 // Increase the shift amount to account for the left shift.
17407 ShiftAmt += WideElts - NumElts;
17408 }
17409
17410 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17411 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17412 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17413 DAG.getIntPtrConstant(0, DL));
17414 }
17415 Offset += NumElts; // Increment for next iteration.
17416 }
17417
17418 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17419 // ops instead.
17420 // TODO: What other unary shuffles would benefit from this?
17421 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17422 SDValue Op0 = V1.getOperand(0);
17423 SDValue Op1 = V1.getOperand(1);
17424 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17425 EVT OpVT = Op0.getValueType();
17426 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17427 return DAG.getSetCC(
17428 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17429 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17430 }
17431
17432 MVT ExtVT;
17433 switch (VT.SimpleTy) {
17434 default:
17435 llvm_unreachable("Expected a vector of i1 elements");
17436 case MVT::v2i1:
17437 ExtVT = MVT::v2i64;
17438 break;
17439 case MVT::v4i1:
17440 ExtVT = MVT::v4i32;
17441 break;
17442 case MVT::v8i1:
17443 // Take a 512-bit type: there are more shuffles available on KNL. If we have
17444 // VLX, use a 256-bit shuffle.
17445 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17446 break;
17447 case MVT::v16i1:
17448 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17449 // 256-bit operation available.
17450 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17451 break;
17452 case MVT::v32i1:
17453 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17454 // 256-bit operation available.
17455 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17456 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17457 break;
17458 case MVT::v64i1:
17459 // Fall back to scalarization. FIXME: We can do better if the shuffle
17460 // can be partitioned cleanly.
17461 if (!Subtarget.useBWIRegs())
17462 return SDValue();
17463 ExtVT = MVT::v64i8;
17464 break;
17465 }
17466
17467 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17468 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17469
17470 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17471 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
17472 int NumElems = VT.getVectorNumElements();
17473 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17474 (Subtarget.hasDQI() && (NumElems < 32)))
17475 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17476 Shuffle, ISD::SETGT);
17477
17478 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17479}
17480
17481/// Helper function that returns true if the shuffle mask should be
17482/// commuted to improve canonicalization.
17483static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17484 int NumElements = Mask.size();
17485
17486 int NumV1Elements = 0, NumV2Elements = 0;
17487 for (int M : Mask)
17488 if (M < 0)
17489 continue;
17490 else if (M < NumElements)
17491 ++NumV1Elements;
17492 else
17493 ++NumV2Elements;
17494
17495 // Commute the shuffle as needed such that more elements come from V1 than
17496 // V2. This allows us to match the shuffle pattern strictly on how many
17497 // elements come from V1 without handling the symmetric cases.
17498 if (NumV2Elements > NumV1Elements)
17499 return true;
17500
17501 assert(NumV1Elements > 0 && "No V1 indices");
17502
17503 if (NumV2Elements == 0)
17504 return false;
17505
17506 // When the number of V1 and V2 elements are the same, try to minimize the
17507 // number of uses of V2 in the low half of the vector. When that is tied,
17508 // ensure that the sum of indices for V1 is equal to or lower than the sum of
17509 // indices for V2. When those are equal, try to ensure that the number of odd
17510 // indices for V1 is lower than the number of odd indices for V2.
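 // For example (illustrative), the 4-element mask <4,5,0,1> takes both
 // low-half elements from V2, so this returns true and the caller commutes
 // the operands, turning the mask into <0,1,4,5>.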
17511 if (NumV1Elements == NumV2Elements) {
17512 int LowV1Elements = 0, LowV2Elements = 0;
17513 for (int M : Mask.slice(0, NumElements / 2))
17514 if (M >= NumElements)
17515 ++LowV2Elements;
17516 else if (M >= 0)
17517 ++LowV1Elements;
17518 if (LowV2Elements > LowV1Elements)
17519 return true;
17520 if (LowV2Elements == LowV1Elements) {
17521 int SumV1Indices = 0, SumV2Indices = 0;
17522 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17523 if (Mask[i] >= NumElements)
17524 SumV2Indices += i;
17525 else if (Mask[i] >= 0)
17526 SumV1Indices += i;
17527 if (SumV2Indices < SumV1Indices)
17528 return true;
17529 if (SumV2Indices == SumV1Indices) {
17530 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17531 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17532 if (Mask[i] >= NumElements)
17533 NumV2OddIndices += i % 2;
17534 else if (Mask[i] >= 0)
17535 NumV1OddIndices += i % 2;
17536 if (NumV2OddIndices < NumV1OddIndices)
17537 return true;
17538 }
17539 }
17540 }
17541
17542 return false;
17543}
17544
17545static bool canCombineAsMaskOperation(SDValue V,
17546 const X86Subtarget &Subtarget) {
17547 if (!Subtarget.hasAVX512())
17548 return false;
17549
17550 if (!V.getValueType().isSimple())
17551 return false;
17552
17553 MVT VT = V.getSimpleValueType().getScalarType();
17554 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17555 return false;
17556
17557 // If the vector width is < 512, widen i8/i16 even with BWI, as
17558 // blendd/blendps/blendpd are preferable to blendw/blendvb/masked-mov.
17559 if ((VT == MVT::i16 || VT == MVT::i8) &&
17560 V.getSimpleValueType().getSizeInBits() < 512)
17561 return false;
17562
17563 auto HasMaskOperation = [&](SDValue V) {
17564 // TODO: Currently we only check a limited set of opcodes. We could probably
17565 // extend this to all binary operations by checking TLI.isBinOp().
17566 switch (V->getOpcode()) {
17567 default:
17568 return false;
17569 case ISD::ADD:
17570 case ISD::SUB:
17571 case ISD::AND:
17572 case ISD::XOR:
17573 case ISD::OR:
17574 case ISD::SMAX:
17575 case ISD::SMIN:
17576 case ISD::UMAX:
17577 case ISD::UMIN:
17578 case ISD::ABS:
17579 case ISD::SHL:
17580 case ISD::SRL:
17581 case ISD::SRA:
17582 case ISD::MUL:
17583 break;
17584 }
17585 if (!V->hasOneUse())
17586 return false;
17587
17588 return true;
17589 };
17590
17591 if (HasMaskOperation(V))
17592 return true;
17593
17594 return false;
17595}
17596
17597// Forward declaration.
17598static SDValue canonicalizeShuffleMaskWithHorizOp(
17599 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17600 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17601 const X86Subtarget &Subtarget);
17602
17603 /// Top-level lowering for x86 vector shuffles.
17604///
17605/// This handles decomposition, canonicalization, and lowering of all x86
17606/// vector shuffles. Most of the specific lowering strategies are encapsulated
17607/// above in helper routines. The canonicalization attempts to widen shuffles
17608/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17609/// s.t. only one of the two inputs needs to be tested, etc.
17610static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17611 SelectionDAG &DAG) {
17612 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17613 ArrayRef<int> OrigMask = SVOp->getMask();
17614 SDValue V1 = Op.getOperand(0);
17615 SDValue V2 = Op.getOperand(1);
17616 MVT VT = Op.getSimpleValueType();
17617 int NumElements = VT.getVectorNumElements();
17618 SDLoc DL(Op);
17619 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17620
17621 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17622 "Can't lower MMX shuffles");
17623
17624 bool V1IsUndef = V1.isUndef();
17625 bool V2IsUndef = V2.isUndef();
17626 if (V1IsUndef && V2IsUndef)
17627 return DAG.getUNDEF(VT);
17628
17629 // When we create a shuffle node we put the UNDEF node in the second operand,
17630 // but in some cases the first operand may be transformed to UNDEF.
17631 // In this case we should just commute the node.
17632 if (V1IsUndef)
17633 return DAG.getCommutedVectorShuffle(*SVOp);
17634
17635 // Check for non-undef masks pointing at an undef vector and make the masks
17636 // undef as well. This makes it easier to match the shuffle based solely on
17637 // the mask.
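 // For example (illustrative), with V2 undef and 4 elements, the mask
 // <0,5,2,7> is rewritten to <0,-1,2,-1>.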
17638 if (V2IsUndef &&
17639 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17640 SmallVector<int, 8> NewMask(OrigMask);
17641 for (int &M : NewMask)
17642 if (M >= NumElements)
17643 M = -1;
17644 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17645 }
17646
17647 // Check for illegal shuffle mask element index values.
17648 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17649 (void)MaskUpperLimit;
17650 assert(llvm::all_of(OrigMask,
17651 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17652 "Out of bounds shuffle index");
17653
17654 // We actually see shuffles that are entirely re-arrangements of a set of
17655 // zero inputs. This mostly happens while decomposing complex shuffles into
17656 // simple ones. Directly lower these as a buildvector of zeros.
17657 APInt KnownUndef, KnownZero;
17658 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17659
17660 APInt Zeroable = KnownUndef | KnownZero;
17661 if (Zeroable.isAllOnes())
17662 return getZeroVector(VT, Subtarget, DAG, DL);
17663
17664 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17665
17666 // Try to collapse shuffles into using a vector type with fewer elements but
17667 // wider element types. We cap this to not form integers or floating point
17668 // elements wider than 64 bits. It does not seem beneficial to form i128
17669 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
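 // For example (illustrative), the v4i32 mask <0,1,4,5> can be widened to
 // the v2i64 mask <0,2>, halving the element count.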
17670 SmallVector<int, 16> WidenedMask;
17671 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17672 !canCombineAsMaskOperation(V1, Subtarget) &&
17673 !canCombineAsMaskOperation(V2, Subtarget) &&
17674 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17675 // Shuffle mask widening should not interfere with a broadcast opportunity
17676 // by obfuscating the operands with bitcasts.
17677 // TODO: Avoid lowering directly from this top-level function: make this
17678 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17679 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17680 Subtarget, DAG))
17681 return Broadcast;
17682
17683 MVT NewEltVT = VT.isFloatingPoint()
17684 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17685 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17686 int NewNumElts = NumElements / 2;
17687 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17688 // Make sure that the new vector type is legal. For example, v2f64 isn't
17689 // legal on SSE1.
17690 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17691 if (V2IsZero) {
17692 // Modify the new Mask to take all zeros from the all-zero vector.
17693 // Choose indices that are blend-friendly.
17694 bool UsedZeroVector = false;
17695 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17696 "V2's non-undef elements are used?!");
17697 for (int i = 0; i != NewNumElts; ++i)
17698 if (WidenedMask[i] == SM_SentinelZero) {
17699 WidenedMask[i] = i + NewNumElts;
17700 UsedZeroVector = true;
17701 }
17702 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17703 // some elements to be undef.
17704 if (UsedZeroVector)
17705 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17706 }
17707 V1 = DAG.getBitcast(NewVT, V1);
17708 V2 = DAG.getBitcast(NewVT, V2);
17709 return DAG.getBitcast(
17710 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17711 }
17712 }
17713
17714 SmallVector<SDValue> Ops = {V1, V2};
17715 SmallVector<int> Mask(OrigMask);
17716
17717 // Canonicalize the shuffle with any horizontal ops inputs.
17718 // NOTE: This may update Ops and Mask.
17719 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17720 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17721 return DAG.getBitcast(VT, HOp);
17722
17723 V1 = DAG.getBitcast(VT, Ops[0]);
17724 V2 = DAG.getBitcast(VT, Ops[1]);
17725 assert(NumElements == (int)Mask.size() &&
17726 "canonicalizeShuffleMaskWithHorizOp "
17727 "shouldn't alter the shuffle mask size");
17728
17729 // Commute the shuffle if it will improve canonicalization.
17730 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17731 ShuffleVectorSDNode::commuteMask(Mask);
17732 std::swap(V1, V2);
17733 }
17734
17735 // For each vector width, delegate to a specialized lowering routine.
17736 if (VT.is128BitVector())
17737 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17738
17739 if (VT.is256BitVector())
17740 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17741
17742 if (VT.is512BitVector())
17743 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17744
17745 if (Is1BitVector)
17746 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17747
17748 llvm_unreachable("Unimplemented!");
17749}
17750
17751/// Try to lower a VSELECT instruction to a vector shuffle.
17752static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17753 const X86Subtarget &Subtarget,
17754 SelectionDAG &DAG) {
17755 SDValue Cond = Op.getOperand(0);
17756 SDValue LHS = Op.getOperand(1);
17757 SDValue RHS = Op.getOperand(2);
17758 MVT VT = Op.getSimpleValueType();
17759
17760 // Only non-legal VSELECTs reach this lowering; convert those into generic
17761 // shuffles and re-use the shuffle lowering path for blends.
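 // For example (illustrative), a v4i32 (vselect <-1,0,0,-1>, LHS, RHS)
 // becomes (vector_shuffle LHS, RHS, <0,5,6,3>).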
17762 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17763 SmallVector<int, 32> Mask;
17764 if (createShuffleMaskFromVSELECT(Mask, Cond))
17765 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17766 }
17767
17768 return SDValue();
17769}
17770
17771SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17772 SDValue Cond = Op.getOperand(0);
17773 SDValue LHS = Op.getOperand(1);
17774 SDValue RHS = Op.getOperand(2);
17775
17776 SDLoc dl(Op);
17777 MVT VT = Op.getSimpleValueType();
17778 if (isSoftF16(VT, Subtarget)) {
17779 MVT NVT = VT.changeVectorElementTypeToInteger();
17780 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17781 DAG.getBitcast(NVT, LHS),
17782 DAG.getBitcast(NVT, RHS)));
17783 }
17784
17785 // A vselect where all conditions and data are constants can be optimized into
17786 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17787 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17788 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17789 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17790 return SDValue();
17791
17792 // Try to lower this to a blend-style vector shuffle. This can handle all
17793 // constant condition cases.
17794 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17795 return BlendOp;
17796
17797 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17798 // with patterns on the mask registers on AVX-512.
17799 MVT CondVT = Cond.getSimpleValueType();
17800 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17801 if (CondEltSize == 1)
17802 return Op;
17803
17804 // Variable blends are only legal from SSE4.1 onward.
17805 if (!Subtarget.hasSSE41())
17806 return SDValue();
17807
17808 unsigned EltSize = VT.getScalarSizeInBits();
17809 unsigned NumElts = VT.getVectorNumElements();
17810
17811 // Expand v32i16/v64i8 without BWI.
17812 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17813 return SDValue();
17814
17815 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17816 // into an i1 condition so that we can use the mask-based 512-bit blend
17817 // instructions.
17818 if (VT.getSizeInBits() == 512) {
17819 // Build a mask by testing the condition against zero.
17820 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17821 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17822 DAG.getConstant(0, dl, CondVT),
17823 ISD::SETNE);
17824 // Now return a new VSELECT using the mask.
17825 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17826 }
17827
17828 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17829 if (CondEltSize != EltSize) {
17830 // If we don't have a sign splat, rely on the expansion.
17831 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17832 return SDValue();
17833
17834 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17835 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17836 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17837 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17838 }
17839
17840 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
17841 // are free to split, then it is better to split before expanding the
17842 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
17843 // TODO: This is very similar to narrowVectorSelect.
17844 // TODO: Add Load splitting to isFreeToSplitVector ?
17845 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
17846 !Subtarget.hasXOP()) {
17847 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
17848 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
17849 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
17850 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
17851 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
17852 if (FreeCond && (FreeLHS || FreeRHS))
17853 return splitVectorOp(Op, DAG, dl);
17854 }
17855
17856 // Only some types will be legal on some subtargets. If we can emit a legal
17857 // VSELECT-matching blend, return Op, but if we need to expand, return
17858 // a null value.
17859 switch (VT.SimpleTy) {
17860 default:
17861 // Most of the vector types have blends past SSE4.1.
17862 return Op;
17863
17864 case MVT::v32i8:
17865 // The byte blends for AVX vectors were introduced only in AVX2.
17866 if (Subtarget.hasAVX2())
17867 return Op;
17868
17869 return SDValue();
17870
17871 case MVT::v8i16:
17872 case MVT::v16i16: {
17873 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17874 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17875 Cond = DAG.getBitcast(CastVT, Cond);
17876 LHS = DAG.getBitcast(CastVT, LHS);
17877 RHS = DAG.getBitcast(CastVT, RHS);
17878 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17879 return DAG.getBitcast(VT, Select);
17880 }
17881 }
17882}
17883
17884static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17885 MVT VT = Op.getSimpleValueType();
17886 SDValue Vec = Op.getOperand(0);
17887 SDValue Idx = Op.getOperand(1);
17888 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17889 SDLoc dl(Op);
17890
17891 if (!Vec.getSimpleValueType().is128BitVector())
17892 return SDValue();
17893
17894 if (VT.getSizeInBits() == 8) {
17895 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17896 // we're going to zero extend the register or fold the store.
17897 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17898 !X86::mayFoldIntoStore(Op))
17899 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17900 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17901 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17902
17903 unsigned IdxVal = Idx->getAsZExtVal();
17904 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17905 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17906 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17907 }
17908
17909 if (VT == MVT::f32) {
17910 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17911 // the result back to FR32 register. It's only worth matching if the
17912 // result has a single use which is a store or a bitcast to i32. And in
17913 // the case of a store, it's not worth it if the index is a constant 0,
17914 // because a MOVSSmr can be used instead, which is smaller and faster.
17915 if (!Op.hasOneUse())
17916 return SDValue();
17917 SDNode *User = *Op.getNode()->use_begin();
17918 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17919 (User->getOpcode() != ISD::BITCAST ||
17920 User->getValueType(0) != MVT::i32))
17921 return SDValue();
17922 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17923 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17924 return DAG.getBitcast(MVT::f32, Extract);
17925 }
17926
17927 if (VT == MVT::i32 || VT == MVT::i64)
17928 return Op;
17929
17930 return SDValue();
17931}
17932
17933/// Extract one bit from mask vector, like v16i1 or v8i1.
17934/// AVX-512 feature.
17935static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17936 const X86Subtarget &Subtarget) {
17937 SDValue Vec = Op.getOperand(0);
17938 SDLoc dl(Vec);
17939 MVT VecVT = Vec.getSimpleValueType();
17940 SDValue Idx = Op.getOperand(1);
17941 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17942 MVT EltVT = Op.getSimpleValueType();
17943
17944 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17945 "Unexpected vector type in ExtractBitFromMaskVector");
17946
17947 // A variable index can't be handled in mask registers,
17948 // so extend the vector to VR512/VR128.
17949 if (!IdxC) {
17950 unsigned NumElts = VecVT.getVectorNumElements();
17951 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17952 // than extending to 128/256-bit.
17953 if (NumElts == 1) {
17954 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17955 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17956 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17957 }
17958 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17959 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17960 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17961 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17962 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17963 }
17964
17965 unsigned IdxVal = IdxC->getZExtValue();
17966 if (IdxVal == 0) // the operation is legal
17967 return Op;
17968
17969 // Extend to natively supported kshift.
17970 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17971
17972 // Use kshiftr instruction to move to the lower element.
17973 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17974 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17975
17976 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17977 DAG.getIntPtrConstant(0, dl));
17978}
17979
17980// Helper to find all the extracted elements from a vector.
17981static APInt getExtractedDemandedElts(SDNode *N) {
17982 MVT VT = N->getSimpleValueType(0);
17983 unsigned NumElts = VT.getVectorNumElements();
17984 APInt DemandedElts = APInt::getZero(NumElts);
17985 for (SDNode *User : N->uses()) {
17986 switch (User->getOpcode()) {
17987 case X86ISD::PEXTRB:
17988 case X86ISD::PEXTRW:
17989 case ISD::EXTRACT_VECTOR_ELT:
17990 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17991 DemandedElts.setAllBits();
17992 return DemandedElts;
17993 }
17994 DemandedElts.setBit(User->getConstantOperandVal(1));
17995 break;
17996 case ISD::BITCAST: {
17997 if (!User->getValueType(0).isSimple() ||
17998 !User->getValueType(0).isVector()) {
17999 DemandedElts.setAllBits();
18000 return DemandedElts;
18001 }
18002 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18003 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18004 break;
18005 }
18006 default:
18007 DemandedElts.setAllBits();
18008 return DemandedElts;
18009 }
18010 }
18011 return DemandedElts;
18012}
18013
18014SDValue
18015X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18016 SelectionDAG &DAG) const {
18017 SDLoc dl(Op);
18018 SDValue Vec = Op.getOperand(0);
18019 MVT VecVT = Vec.getSimpleValueType();
18020 SDValue Idx = Op.getOperand(1);
18021 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18022
18023 if (VecVT.getVectorElementType() == MVT::i1)
18024 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18025
18026 if (!IdxC) {
18027 // It's more profitable to go through memory (1 cycle throughput)
18028 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18029 // The IACA tool was used to get the performance estimates
18030 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18031 //
18032 // example : extractelement <16 x i8> %a, i32 %i
18033 //
18034 // Block Throughput: 3.00 Cycles
18035 // Throughput Bottleneck: Port5
18036 //
18037 // | Num Of | Ports pressure in cycles | |
18038 // | Uops | 0 - DV | 5 | 6 | 7 | |
18039 // ---------------------------------------------
18040 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18041 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18042 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18043 // Total Num Of Uops: 4
18044 //
18045 //
18046 // Block Throughput: 1.00 Cycles
18047 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18048 //
18049 // | | Ports pressure in cycles | |
18050 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18051 // ---------------------------------------------------------
18052 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18053 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18054 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18055 // Total Num Of Uops: 4
18056
18057 return SDValue();
18058 }
18059
18060 unsigned IdxVal = IdxC->getZExtValue();
18061
18062 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
18063 // subvector and then extract the element from that 128-bit vector.
18064 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18065 // Get the 128-bit vector.
18066 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18067 MVT EltVT = VecVT.getVectorElementType();
18068
18069 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18070 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18071
18072 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18073 // this can be done with a mask.
18074 IdxVal &= ElemsPerChunk - 1;
18075 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18076 DAG.getIntPtrConstant(IdxVal, dl));
18077 }
18078
18079 assert(VecVT.is128BitVector() && "Unexpected vector length");
18080
18081 MVT VT = Op.getSimpleValueType();
18082
18083 if (VT == MVT::i16) {
18084 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18085 // we're going to zero extend the register or fold the store (SSE41 only).
18086 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18087 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18088 if (Subtarget.hasFP16())
18089 return Op;
18090
18091 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18092 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18093 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18094 }
18095
18096 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18097 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18098 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18099 }
18100
18101 if (Subtarget.hasSSE41())
18102 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18103 return Res;
18104
18105 // Only extract a single element from a v16i8 source - determine the common
18106 // DWORD/WORD that all extractions share, and extract the sub-byte.
18107 // TODO: Add QWORD MOVQ extraction?
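 // For example (illustrative), extracting byte 5 when every demanded byte
 // lies in word 2 becomes
 //   (trunc (srl (extract_vector_elt (bitcast v8i16), 2), 8)).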
18108 if (VT == MVT::i8) {
18109 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18110 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18111
18112 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18113 int DWordIdx = IdxVal / 4;
18114 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18115 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18116 DAG.getBitcast(MVT::v4i32, Vec),
18117 DAG.getIntPtrConstant(DWordIdx, dl));
18118 int ShiftVal = (IdxVal % 4) * 8;
18119 if (ShiftVal != 0)
18120 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18121 DAG.getConstant(ShiftVal, dl, MVT::i8));
18122 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18123 }
18124
18125 int WordIdx = IdxVal / 2;
18126 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18127 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18128 DAG.getBitcast(MVT::v8i16, Vec),
18129 DAG.getIntPtrConstant(WordIdx, dl));
18130 int ShiftVal = (IdxVal % 2) * 8;
18131 if (ShiftVal != 0)
18132 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18133 DAG.getConstant(ShiftVal, dl, MVT::i8));
18134 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18135 }
18136 }
18137
18138 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18139 if (IdxVal == 0)
18140 return Op;
18141
18142 // Shuffle the element to the lowest element, then movss or movsh.
18144 Mask[0] = static_cast<int>(IdxVal);
18145 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18146 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18147 DAG.getIntPtrConstant(0, dl));
18148 }
18149
18150 if (VT.getSizeInBits() == 64) {
18151 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18152 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18153 // to match extract_elt for f64.
18154 if (IdxVal == 0)
18155 return Op;
18156
18157 // UNPCKHPD the element to the lowest double word, then movsd.
18158 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18159 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18160 int Mask[2] = { 1, -1 };
18161 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18162 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18163 DAG.getIntPtrConstant(0, dl));
18164 }
18165
18166 return SDValue();
18167}
18168
18169/// Insert one bit to mask vector, like v16i1 or v8i1.
18170/// AVX-512 feature.
18171static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18172 const X86Subtarget &Subtarget) {
18173 SDLoc dl(Op);
18174 SDValue Vec = Op.getOperand(0);
18175 SDValue Elt = Op.getOperand(1);
18176 SDValue Idx = Op.getOperand(2);
18177 MVT VecVT = Vec.getSimpleValueType();
18178
18179 if (!isa<ConstantSDNode>(Idx)) {
18180 // Non-constant index. Extend the source and destination,
18181 // insert the element, and then truncate the result.
18182 unsigned NumElts = VecVT.getVectorNumElements();
18183 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18184 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18185 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18186 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18187 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18188 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18189 }
18190
18191 // Copy into a k-register, extract to v1i1 and insert_subvector.
18192 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18193 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18194}
18195
18196SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18197 SelectionDAG &DAG) const {
18198 MVT VT = Op.getSimpleValueType();
18199 MVT EltVT = VT.getVectorElementType();
18200 unsigned NumElts = VT.getVectorNumElements();
18201 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18202
18203 if (EltVT == MVT::i1)
18204 return InsertBitToMaskVector(Op, DAG, Subtarget);
18205
18206 SDLoc dl(Op);
18207 SDValue N0 = Op.getOperand(0);
18208 SDValue N1 = Op.getOperand(1);
18209 SDValue N2 = Op.getOperand(2);
18210 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18211
18212 if (EltVT == MVT::bf16) {
18213 MVT IVT = VT.changeVectorElementTypeToInteger();
18214 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18215 DAG.getBitcast(IVT, N0),
18216 DAG.getBitcast(MVT::i16, N1), N2);
18217 return DAG.getBitcast(VT, Res);
18218 }
18219
18220 if (!N2C) {
18221 // With variable insertion indices we're usually better off spilling to stack,
18222 // but AVX512 can use a variable compare+select by comparing against all
18223 // possible vector indices, and FP insertion has less gpr->simd traffic.
18224 if (!(Subtarget.hasBWI() ||
18225 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18226 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18227 return SDValue();
18228
18229 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18230 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18231 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18232 return SDValue();
18233
18234 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18235 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18236 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18237
18238 SmallVector<SDValue, 16> RawIndices;
18239 for (unsigned I = 0; I != NumElts; ++I)
18240 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18241 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18242
18243 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18244 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18245 ISD::CondCode::SETEQ);
18246 }
18247
18248 if (N2C->getAPIntValue().uge(NumElts))
18249 return SDValue();
18250 uint64_t IdxVal = N2C->getZExtValue();
18251
18252 bool IsZeroElt = X86::isZeroNode(N1);
18253 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18254
18255 if (IsZeroElt || IsAllOnesElt) {
18256 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18257 // We don't deal with i8 0 since it appears to be handled elsewhere.
18258 if (IsAllOnesElt &&
18259 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18260 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18261 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18262 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18263 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18264 CstVectorElts[IdxVal] = OnesCst;
18265 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18266 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18267 }
18268 // See if we can do this more efficiently with a blend shuffle with a
18269 // rematerializable vector.
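 // For example (illustrative), inserting a zero into element 3 of a v8i32
 // becomes a shuffle of N0 against an all-zeros vector with the blend mask
 // <0,1,2,11,4,5,6,7>.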
18270 if (Subtarget.hasSSE41() &&
18271 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18272 SmallVector<int, 8> BlendMask;
18273 for (unsigned i = 0; i != NumElts; ++i)
18274 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18275 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18276 : getOnesVector(VT, DAG, dl);
18277 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18278 }
18279 }
18280
18281 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18282 // into that, and then insert the subvector back into the result.
18283 if (VT.is256BitVector() || VT.is512BitVector()) {
18284 // With a 256-bit vector, we can insert into the zero element efficiently
18285 // using a blend if we have AVX or AVX2 and the right data type.
18286 if (VT.is256BitVector() && IdxVal == 0) {
18287 // TODO: It is worthwhile to cast integer to floating point and back
18288 // and incur a domain crossing penalty if that's what we'll end up
18289 // doing anyway after extracting to a 128-bit vector.
18290 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18291 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18292 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18293 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18294 DAG.getTargetConstant(1, dl, MVT::i8));
18295 }
18296 }
18297
18298 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18299 assert(isPowerOf2_32(NumEltsIn128) &&
18300 "Vectors will always have power-of-two number of elements.");
18301
18302 // If we are not inserting into the low 128-bit vector chunk,
18303 // then prefer the broadcast+blend sequence.
18304 // FIXME: relax the profitability check iff all N1 uses are insertions.
18305 if (IdxVal >= NumEltsIn128 &&
18306 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18307 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18308 X86::mayFoldLoad(N1, Subtarget)))) {
18309 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18310 SmallVector<int, 8> BlendMask;
18311 for (unsigned i = 0; i != NumElts; ++i)
18312 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18313 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18314 }
18315
18316 // Get the desired 128-bit vector chunk.
18317 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18318
18319 // Insert the element into the desired chunk.
18320 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18321 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18322
18323 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18324 DAG.getIntPtrConstant(IdxIn128, dl));
18325
18326 // Insert the changed part back into the bigger vector.
18327 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18328 }
18329 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18330
18331 // This will be just movw/movd/movq/movsh/movss/movsd.
18332 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18333 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18334 EltVT == MVT::f16 || EltVT == MVT::i64) {
18335 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18336 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18337 }
18338
18339 // We can't directly insert an i8 or i16 into a vector, so zero extend
18340 // it to i32 first.
18341 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18342 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18343 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18344 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18345 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18346 return DAG.getBitcast(VT, N1);
18347 }
18348 }
18349
18350 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18351 // argument. SSE41 is required for pinsrb.
18352 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18353 unsigned Opc;
18354 if (VT == MVT::v8i16) {
18355 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18356 Opc = X86ISD::PINSRW;
18357 } else {
18358 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18359 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18360 Opc = X86ISD::PINSRB;
18361 }
18362
18363 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18364 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18365 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18366 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18367 }
18368
18369 if (Subtarget.hasSSE41()) {
18370 if (EltVT == MVT::f32) {
18371 // Bits [7:6] of the constant are the source select. This will always be
18372 // zero here. The DAG Combiner may combine an extract_elt index into
18373 // these bits. For example (insert (extract, 3), 2) could be matched by
18374 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18375 // Bits [5:4] of the constant are the destination select. This is the
18376 // value of the incoming immediate.
18377 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18378 // combine either bitwise AND or insert of float 0.0 to set these bits.
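 // For example (illustrative), inserting into lane 2 of a v4f32 uses the
 // immediate (2 << 4) == 0x20: destination select bits [5:4] = 2, with the
 // source select and zero mask left at 0.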
18379
18380 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18381 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18382 // If this is an insertion of 32-bits into the low 32-bits of
18383 // a vector, we prefer to generate a blend with immediate rather
18384 // than an insertps. Blends are simpler operations in hardware and so
18385 // will always have equal or better performance than insertps.
18386 // But if optimizing for size and there's a load folding opportunity,
18387 // generate insertps because blendps does not have a 32-bit memory
18388 // operand form.
18389 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18390 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18391 DAG.getTargetConstant(1, dl, MVT::i8));
18392 }
18393 // Create this as a scalar-to-vector.
18394 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18395 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18396 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18397 }
18398
18399 // PINSR* works with constant index.
18400 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18401 return Op;
18402 }
18403
18404 return SDValue();
18405}
18406
18407static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18408 SelectionDAG &DAG) {
18409 SDLoc dl(Op);
18410 MVT OpVT = Op.getSimpleValueType();
18411
18412 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18413 // further combines.
18414 if (X86::isZeroNode(Op.getOperand(0)))
18415 return getZeroVector(OpVT, Subtarget, DAG, dl);
18416
18417 // If this is a 256-bit vector result, first insert into a 128-bit
18418 // vector and then insert into the 256-bit vector.
18419 if (!OpVT.is128BitVector()) {
18420 // Insert into a 128-bit vector.
18421 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18422 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18423 OpVT.getVectorNumElements() / SizeFactor);
18424
18425 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18426
18427 // Insert the 128-bit vector.
18428 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18429 }
18430 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18431 "Expected an SSE type!");
18432
18433 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18434 // tblgen.
18435 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18436 return Op;
18437
18438 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18439 return DAG.getBitcast(
18440 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18441}
18442
18443// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18444// simple superregister reference or explicit instructions to insert
18445// the upper bits of a vector.
18446static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18447 SelectionDAG &DAG) {
18448 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18449
18450 return insert1BitVector(Op, DAG, Subtarget);
18451}
18452
18453static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18454 SelectionDAG &DAG) {
18455 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18456 "Only vXi1 extract_subvectors need custom lowering");
18457
18458 SDLoc dl(Op);
18459 SDValue Vec = Op.getOperand(0);
18460 uint64_t IdxVal = Op.getConstantOperandVal(1);
18461
18462 if (IdxVal == 0) // the operation is legal
18463 return Op;
18464
18465 // Extend to natively supported kshift.
18466 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18467
18468 // Shift to the LSB.
18469 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18470 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18471
18472 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18473 DAG.getIntPtrConstant(0, dl));
18474}
18475
18476// Returns the appropriate wrapper opcode for a global reference.
18477unsigned X86TargetLowering::getGlobalWrapperKind(
18478 const GlobalValue *GV, const unsigned char OpFlags) const {
18479 // References to absolute symbols are never PC-relative.
18480 if (GV && GV->isAbsoluteSymbolRef())
18481 return X86ISD::Wrapper;
18482
18483 // The following OpFlags under RIP-rel PIC use RIP.
18484 if (Subtarget.isPICStyleRIPRel() &&
18485 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18486 OpFlags == X86II::MO_DLLIMPORT))
18487 return X86ISD::WrapperRIP;
18488
18489 // GOTPCREL references must always use RIP.
18490 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18491 return X86ISD::WrapperRIP;
18492
18493 return X86ISD::Wrapper;
18494}
18495
18496// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18497// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18498// one of the above-mentioned nodes. It has to be wrapped because otherwise
18499// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18500// be used to form an addressing mode. These wrapped nodes will be selected
18501// into MOV32ri.
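// For example (illustrative), in non-PIC 32-bit code a constant-pool
// reference becomes (X86ISD::Wrapper (TargetConstantPool ...)), which
// instruction selection can fold into an addressing mode or materialize
// with MOV32ri.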
18502SDValue
18503X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18504 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18505
18506 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18507 // global base reg.
18508 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18509
18510 auto PtrVT = getPointerTy(DAG.getDataLayout());
18511 SDValue Result = DAG.getTargetConstantPool(
18512 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18513 SDLoc DL(CP);
18514 Result =
18515 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18516 // With PIC, the address is actually $g + Offset.
18517 if (OpFlag) {
18518 Result =
18519 DAG.getNode(ISD::ADD, DL, PtrVT,
18520 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18521 }
18522
18523 return Result;
18524}
18525
18526SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18527 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18528
18529 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18530 // global base reg.
18531 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18532
18533 auto PtrVT = getPointerTy(DAG.getDataLayout());
18534 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18535 SDLoc DL(JT);
18536 Result =
18537 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18538
18539 // With PIC, the address is actually $g + Offset.
18540 if (OpFlag)
18541 Result =
18542 DAG.getNode(ISD::ADD, DL, PtrVT,
18543 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18544
18545 return Result;
18546}
18547
18548SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18549 SelectionDAG &DAG) const {
18550 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18551}
18552
18553SDValue
18554X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18555 // Create the TargetBlockAddress node.
18556 unsigned char OpFlags =
18557 Subtarget.classifyBlockAddressReference();
18558 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18559 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18560 SDLoc dl(Op);
18561 auto PtrVT = getPointerTy(DAG.getDataLayout());
18562 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18563 Result =
18564 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18565
18566 // With PIC, the address is actually $g + Offset.
18567 if (isGlobalRelativeToPICBase(OpFlags)) {
18568 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18569 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18570 }
18571
18572 return Result;
18573}
18574
18575/// Creates target global address or external symbol nodes for calls or
18576/// other uses.
18577SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18578 bool ForCall) const {
18579 // Unpack the global address or external symbol.
18580 SDLoc dl(Op);
18581 const GlobalValue *GV = nullptr;
18582 int64_t Offset = 0;
18583 const char *ExternalSym = nullptr;
18584 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18585 GV = G->getGlobal();
18586 Offset = G->getOffset();
18587 } else {
18588 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18589 ExternalSym = ES->getSymbol();
18590 }
18591
18592 // Calculate some flags for address lowering.
18593 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18594 unsigned char OpFlags;
18595 if (ForCall)
18596 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18597 else
18598 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18599 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18600 bool NeedsLoad = isGlobalStubReference(OpFlags);
18601 CodeModel::Model M = DAG.getTarget().getCodeModel();
18601
18603 auto PtrVT = getPointerTy(DAG.getDataLayout());
18604 SDValue Result;
18605
18606 if (GV) {
18607 // Create a target global address if this is a global. If possible, fold the
18608 // offset into the global address reference. Otherwise, ADD it on later.
18609 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18610 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18611 // relocation will compute to a negative value, which is invalid.
18612 int64_t GlobalOffset = 0;
18613 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18614 X86::isOffsetSuitableForCodeModel(Offset, M)) {
18615 std::swap(GlobalOffset, Offset);
18616 }
18617 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18618 } else {
18619 // If this is not a global address, this must be an external symbol.
18620 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18621 }
18622
18623 // If this is a direct call, avoid the wrapper if we don't need to do any
18624 // loads or adds. This allows SDAG ISel to match direct calls.
18625 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18626 return Result;
18627
18628 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18629
18630 // With PIC, the address is actually $g + Offset.
18631 if (HasPICReg) {
18632 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18633 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18634 }
18635
18636 // For globals that require a load from a stub to get the address, emit the
18637 // load.
18638 if (NeedsLoad)
18639 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18640 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18641
18642 // If there was a non-zero offset that we didn't fold, create an explicit
18643 // addition for it.
18644 if (Offset != 0)
18645 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18646 DAG.getConstant(Offset, dl, PtrVT));
18647
18648 return Result;
18649}
18650
18651SDValue
18652X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18653 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18654}
18655
18656static SDValue
18657GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18658 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18659 unsigned char OperandFlags, bool LocalDynamic = false) {
18660 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18661 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18662 SDLoc dl(GA);
18663 SDValue TGA;
18664 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18665 if (LocalDynamic && UseTLSDESC) {
18666 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18667 auto UI = TGA->use_begin();
18668 // Reuse existing GetTLSADDR node if we can find it.
18669 if (UI != TGA->use_end())
18670 return SDValue(*UI->use_begin()->use_begin(), 0);
18671 } else {
18672 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18673 GA->getOffset(), OperandFlags);
18674 }
18675
18676 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18677 : LocalDynamic ? X86ISD::TLSBASEADDR
18678 : X86ISD::TLSADDR;
18679
18680 if (InGlue) {
18681 SDValue Ops[] = { Chain, TGA, *InGlue };
18682 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18683 } else {
18684 SDValue Ops[] = { Chain, TGA };
18685 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18686 }
18687
18688 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18689 MFI.setAdjustsStack(true);
18690 MFI.setHasCalls(true);
18691
18692 SDValue Glue = Chain.getValue(1);
18693 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18694
18695 if (!UseTLSDESC)
18696 return Ret;
18697
18698 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18699 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18700
18701 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18702 SDValue Offset =
18703 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18704 MachinePointerInfo(Ptr));
18705 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18706}
18707
18708// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18709static SDValue
18710 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18711 const EVT PtrVT) {
18712 SDValue InGlue;
18713 SDLoc dl(GA); // ? function entry point might be better
18714 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18715 DAG.getNode(X86ISD::GlobalBaseReg,
18716 SDLoc(), PtrVT), InGlue);
18717 InGlue = Chain.getValue(1);
18718
18719 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18720}
18721
18722// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18723static SDValue
18724 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18725 const EVT PtrVT) {
18726 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18727 X86::RAX, X86II::MO_TLSGD);
18728}
18729
18730// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18731static SDValue
18732 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18733 const EVT PtrVT) {
18734 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18735 X86::EAX, X86II::MO_TLSGD);
18736}
18737
18738 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18739 SelectionDAG &DAG, const EVT PtrVT,
18740 bool Is64Bit, bool Is64BitLP64) {
18741 SDLoc dl(GA);
18742
18743 // Get the start address of the TLS block for this module.
18744 X86MachineFunctionInfo *MFI =
18745 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
18746 MFI->incNumLocalDynamicTLSAccesses();
18747
18748 SDValue Base;
18749 if (Is64Bit) {
18750 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18751 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18752 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18753 } else {
18754 SDValue InGlue;
18755 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18756 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18757 InGlue = Chain.getValue(1);
18758 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18759 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18760 }
18761
18762 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18763 // of Base.
18764
18765 // Build x@dtpoff.
18766 unsigned char OperandFlags = X86II::MO_DTPOFF;
18767 unsigned WrapperKind = X86ISD::Wrapper;
18768 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18769 GA->getValueType(0),
18770 GA->getOffset(), OperandFlags);
18771 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18772
18773 // Add x@dtpoff with the base.
18774 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18775}
18776
18777// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18778 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18779 const EVT PtrVT, TLSModel::Model model,
18780 bool is64Bit, bool isPIC) {
18781 SDLoc dl(GA);
18782
18783 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18784 Value *Ptr = Constant::getNullValue(
18785 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18786
18787 SDValue ThreadPointer =
18788 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18789 MachinePointerInfo(Ptr));
18790
18791 unsigned char OperandFlags = 0;
18792 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18793 // initialexec.
18794 unsigned WrapperKind = X86ISD::Wrapper;
18795 if (model == TLSModel::LocalExec) {
18796 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18797 } else if (model == TLSModel::InitialExec) {
18798 if (is64Bit) {
18799 OperandFlags = X86II::MO_GOTTPOFF;
18800 WrapperKind = X86ISD::WrapperRIP;
18801 } else {
18802 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18803 }
18804 } else {
18805 llvm_unreachable("Unexpected model");
18806 }
18807
18808 // emit "addl x@ntpoff,%eax" (local exec)
18809 // or "addl x@indntpoff,%eax" (initial exec)
18810 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18811 SDValue TGA =
18812 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18813 GA->getOffset(), OperandFlags);
18814 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18815
18816 if (model == TLSModel::InitialExec) {
18817 if (isPIC && !is64Bit) {
18818 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18819 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18820 Offset);
18821 }
18822
18823 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18824 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18825 }
18826
18827 // The address of the thread local variable is the add of the thread
18828 // pointer with the offset of the variable.
18829 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18830}
18831
18832SDValue
18833X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18834
18835 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18836
18837 if (DAG.getTarget().useEmulatedTLS())
18838 return LowerToTLSEmulatedModel(GA, DAG);
18839
18840 const GlobalValue *GV = GA->getGlobal();
18841 auto PtrVT = getPointerTy(DAG.getDataLayout());
18842 bool PositionIndependent = isPositionIndependent();
18843
18844 if (Subtarget.isTargetELF()) {
18845 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18846 switch (model) {
18847 case TLSModel::GeneralDynamic:
18848 if (Subtarget.is64Bit()) {
18849 if (Subtarget.isTarget64BitLP64())
18850 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18851 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18852 }
18853 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18854 case TLSModel::LocalDynamic:
18855 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18856 Subtarget.isTarget64BitLP64());
18857 case TLSModel::InitialExec:
18858 case TLSModel::LocalExec:
18859 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18860 PositionIndependent);
18861 }
18862 llvm_unreachable("Unknown TLS model.");
18863 }
18864
18865 if (Subtarget.isTargetDarwin()) {
18866 // Darwin only has one model of TLS. Lower to that.
18867 unsigned char OpFlag = 0;
18868 unsigned WrapperKind = 0;
18869
18870 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18871 // global base reg.
18872 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18873 if (PIC32) {
18874 OpFlag = X86II::MO_TLVP_PIC_BASE;
18875 WrapperKind = X86ISD::Wrapper;
18876 } else {
18877 OpFlag = X86II::MO_TLVP;
18878 WrapperKind = X86ISD::WrapperRIP;
18879 }
18880 SDLoc DL(Op);
18881 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18882 GA->getValueType(0),
18883 GA->getOffset(), OpFlag);
18884 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18885
18886 // With PIC32, the address is actually $g + Offset.
18887 if (PIC32)
18888 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18889 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18890 Offset);
18891
18892 // Lowering the machine ISD node will make sure everything is in the right
18893 // location.
18894 SDValue Chain = DAG.getEntryNode();
18895 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18896 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18897 SDValue Args[] = { Chain, Offset };
18898 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18899 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18900
18901 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
18902 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18903 MFI.setAdjustsStack(true);
18904
18905 // And our return value (tls address) is in the standard call return value
18906 // location.
18907 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18908 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18909 }
18910
18911 if (Subtarget.isOSWindows()) {
18912 // Just use the implicit TLS architecture
18913 // Need to generate something similar to:
18914 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18915 // ; from TEB
18916 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18917 // mov rcx, qword [rdx+rcx*8]
18918 // mov eax, .tls$:tlsvar
18919 // [rax+rcx] contains the address
18920 // Windows 64bit: gs:0x58
18921 // Windows 32bit: fs:__tls_array
18922
18923 SDLoc dl(GA);
18924 SDValue Chain = DAG.getEntryNode();
18925
18926 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18927 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18928 // use its literal value of 0x2C.
18929 Value *Ptr = Constant::getNullValue(
18930 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18931 : PointerType::get(*DAG.getContext(), 257));
18932
18933 SDValue TlsArray = Subtarget.is64Bit()
18934 ? DAG.getIntPtrConstant(0x58, dl)
18935 : (Subtarget.isTargetWindowsGNU()
18936 ? DAG.getIntPtrConstant(0x2C, dl)
18937 : DAG.getExternalSymbol("_tls_array", PtrVT));
18938
18939 SDValue ThreadPointer =
18940 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18941
18942 SDValue res;
18943 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18944 res = ThreadPointer;
18945 } else {
18946 // Load the _tls_index variable
18947 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18948 if (Subtarget.is64Bit())
18949 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18950 MachinePointerInfo(), MVT::i32);
18951 else
18952 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18953
18954 const DataLayout &DL = DAG.getDataLayout();
18955 SDValue Scale =
18956 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18957 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18958
18959 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18960 }
18961
18962 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18963
18964 // Get the offset of start of .tls section
18965 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18966 GA->getValueType(0),
18967 GA->getOffset(), X86II::MO_SECREL);
18968 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18969
18970 // The address of the thread local variable is the add of the thread
18971 // pointer with the offset of the variable.
18972 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18973 }
18974
18975 llvm_unreachable("TLS not implemented for this target.");
18976}
18977
18978 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18979 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18980 const TargetMachine &TM = getTargetMachine();
18981 TLSModel::Model Model = TM.getTLSModel(&GV);
18982 switch (Model) {
18983 case TLSModel::LocalExec:
18984 case TLSModel::InitialExec:
18985 // We can include the %fs segment register in addressing modes.
18986 return true;
18987 case TLSModel::LocalDynamic:
18988 case TLSModel::GeneralDynamic:
18989 // These models do not result in %fs relative addresses unless
18990 // TLS descriptors are used.
18991 //
18992 // Even in the case of TLS descriptors we currently have no way to model
18993 // the difference between the %fs access and the computation needed for the
18994 // offset; returning `true` for TLS-desc currently duplicates both,
18995 // which is detrimental :-/
18996 return false;
18997 }
18998 }
18999 return false;
19000}
19001
19002/// Lower SRA_PARTS and friends, which return two i32 values
19003/// and take a 2 x i32 value to shift plus a shift amount.
19004/// TODO: Can this be moved to general expansion code?
19005 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19006 SDValue Lo, Hi;
19007 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19008 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19009}
19010
19011// Try to use a packed vector operation to handle i64 on 32-bit targets when
19012// AVX512DQ is enabled.
19013 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19014 SelectionDAG &DAG,
19015 const X86Subtarget &Subtarget) {
19016 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19017 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19018 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19019 Op.getOpcode() == ISD::UINT_TO_FP) &&
19020 "Unexpected opcode!");
19021 bool IsStrict = Op->isStrictFPOpcode();
19022 unsigned OpNo = IsStrict ? 1 : 0;
19023 SDValue Src = Op.getOperand(OpNo);
19024 MVT SrcVT = Src.getSimpleValueType();
19025 MVT VT = Op.getSimpleValueType();
19026
19027 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19028 (VT != MVT::f32 && VT != MVT::f64))
19029 return SDValue();
19030
19031 // Pack the i64 into a vector, do the operation and extract.
19032
19033 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
19034 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19035 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19036 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19037
19038 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19039 if (IsStrict) {
19040 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19041 {Op.getOperand(0), InVec});
19042 SDValue Chain = CvtVec.getValue(1);
19043 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19044 DAG.getIntPtrConstant(0, dl));
19045 return DAG.getMergeValues({Value, Chain}, dl);
19046 }
19047
19048 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19049
19050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19051 DAG.getIntPtrConstant(0, dl));
19052}
19053
19054// Try to use a packed vector operation to handle i64 on 32-bit targets.
19055 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19056 const X86Subtarget &Subtarget) {
19057 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19058 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19059 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19060 Op.getOpcode() == ISD::UINT_TO_FP) &&
19061 "Unexpected opcode!");
19062 bool IsStrict = Op->isStrictFPOpcode();
19063 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19064 MVT SrcVT = Src.getSimpleValueType();
19065 MVT VT = Op.getSimpleValueType();
19066
19067 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19068 return SDValue();
19069
19070 // Pack the i64 into a vector, do the operation and extract.
19071
19072 assert(Subtarget.hasFP16() && "Expected FP16");
19073
19074 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19075 if (IsStrict) {
19076 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19077 {Op.getOperand(0), InVec});
19078 SDValue Chain = CvtVec.getValue(1);
19079 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19080 DAG.getIntPtrConstant(0, dl));
19081 return DAG.getMergeValues({Value, Chain}, dl);
19082 }
19083
19084 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19085
19086 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19087 DAG.getIntPtrConstant(0, dl));
19088}
19089
19090static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19091 const X86Subtarget &Subtarget) {
19092 switch (Opcode) {
19093 case ISD::SINT_TO_FP:
19094 // TODO: Handle wider types with AVX/AVX512.
19095 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19096 return false;
19097 // CVTDQ2PS or (V)CVTDQ2PD
19098 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19099
19100 case ISD::UINT_TO_FP:
19101 // TODO: Handle wider types and i64 elements.
19102 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19103 return false;
19104 // VCVTUDQ2PS or VCVTUDQ2PD
19105 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19106
19107 default:
19108 return false;
19109 }
19110}
19111
19112/// Given a scalar cast operation that is extracted from a vector, try to
19113/// vectorize the cast op followed by extraction. This will avoid an expensive
19114/// round-trip between XMM and GPR.
19115 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19116 SelectionDAG &DAG,
19117 const X86Subtarget &Subtarget) {
19118 // TODO: This could be enhanced to handle smaller integer types by peeking
19119 // through an extend.
19120 SDValue Extract = Cast.getOperand(0);
19121 MVT DestVT = Cast.getSimpleValueType();
19122 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19123 !isa<ConstantSDNode>(Extract.getOperand(1)))
19124 return SDValue();
19125
19126 // See if we have a 128-bit vector cast op for this type of cast.
19127 SDValue VecOp = Extract.getOperand(0);
19128 MVT FromVT = VecOp.getSimpleValueType();
19129 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19130 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19131 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19132 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19133 return SDValue();
19134
19135 // If we are extracting from a non-zero element, first shuffle the source
19136 // vector to allow extracting from element zero.
19137 if (!isNullConstant(Extract.getOperand(1))) {
19138 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19139 Mask[0] = Extract.getConstantOperandVal(1);
19140 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19141 }
19142 // If the source vector is wider than 128-bits, extract the low part. Do not
19143 // create an unnecessarily wide vector cast op.
19144 if (FromVT != Vec128VT)
19145 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19146
19147 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19148 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19149 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19150 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19151 DAG.getIntPtrConstant(0, DL));
19152}
19153
19154/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19155/// try to vectorize the cast ops. This will avoid an expensive round-trip
19156/// between XMM and GPR.
19157static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19158 SelectionDAG &DAG,
19159 const X86Subtarget &Subtarget) {
19160 // TODO: Allow FP_TO_UINT.
19161 SDValue CastToInt = CastToFP.getOperand(0);
19162 MVT VT = CastToFP.getSimpleValueType();
19163 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19164 return SDValue();
19165
19166 MVT IntVT = CastToInt.getSimpleValueType();
19167 SDValue X = CastToInt.getOperand(0);
19168 MVT SrcVT = X.getSimpleValueType();
19169 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19170 return SDValue();
19171
19172 // See if we have 128-bit vector cast instructions for this type of cast.
19173 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19174 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19175 IntVT != MVT::i32)
19176 return SDValue();
19177
19178 unsigned SrcSize = SrcVT.getSizeInBits();
19179 unsigned IntSize = IntVT.getSizeInBits();
19180 unsigned VTSize = VT.getSizeInBits();
19181 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19182 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19183 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19184
19185 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19186 unsigned ToIntOpcode =
19187 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19188 unsigned ToFPOpcode =
19189 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19190
19191 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19192 //
19193 // We are not defining the high elements (for example, zero them) because
19194 // that could nullify any performance advantage that we hoped to gain from
19195 // this vector op hack. We do not expect any adverse effects (like denorm
19196 // penalties) with cast ops.
19197 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19198 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19199 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19200 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19202}
19203
19204 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19205 SelectionDAG &DAG,
19206 const X86Subtarget &Subtarget) {
19207 bool IsStrict = Op->isStrictFPOpcode();
19208 MVT VT = Op->getSimpleValueType(0);
19209 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19210
19211 if (Subtarget.hasDQI()) {
19212 assert(!Subtarget.hasVLX() && "Unexpected features");
19213
19214 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19215 Src.getSimpleValueType() == MVT::v4i64) &&
19216 "Unsupported custom type");
19217
19218 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19219 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19220 "Unexpected VT!");
19221 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19222
19223 // Need to concat with zero vector for strict fp to avoid spurious
19224 // exceptions.
19225 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19226 : DAG.getUNDEF(MVT::v8i64);
19227 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19228 DAG.getIntPtrConstant(0, DL));
19229 SDValue Res, Chain;
19230 if (IsStrict) {
19231 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19232 {Op->getOperand(0), Src});
19233 Chain = Res.getValue(1);
19234 } else {
19235 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19236 }
19237
19238 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19239 DAG.getIntPtrConstant(0, DL));
19240
19241 if (IsStrict)
19242 return DAG.getMergeValues({Res, Chain}, DL);
19243 return Res;
19244 }
19245
19246 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19247 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19248 if (VT != MVT::v4f32 || IsSigned)
19249 return SDValue();
19250
19251 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19252 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19253 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19254 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19255 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19256 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19257 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19258 SmallVector<SDValue, 4> SignCvts(4);
19259 SmallVector<SDValue, 4> Chains(4);
19260 for (int i = 0; i != 4; ++i) {
19261 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19262 DAG.getIntPtrConstant(i, DL));
19263 if (IsStrict) {
19264 SignCvts[i] =
19265 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19266 {Op.getOperand(0), Elt});
19267 Chains[i] = SignCvts[i].getValue(1);
19268 } else {
19269 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19270 }
19271 }
19272 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19273
19274 SDValue Slow, Chain;
19275 if (IsStrict) {
19276 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19277 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19278 {Chain, SignCvt, SignCvt});
19279 Chain = Slow.getValue(1);
19280 } else {
19281 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19282 }
19283
19284 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19285 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19286
19287 if (IsStrict)
19288 return DAG.getMergeValues({Cvt, Chain}, DL);
19289
19290 return Cvt;
19291}
19292
19293 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19294 SelectionDAG &DAG) {
19295 bool IsStrict = Op->isStrictFPOpcode();
19296 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19297 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19298 MVT VT = Op.getSimpleValueType();
19299 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19300
19301 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19302 if (IsStrict)
19303 return DAG.getNode(
19304 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19305 {Chain,
19306 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19307 Rnd});
19308 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19309 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19310}
19311
19312static bool isLegalConversion(MVT VT, bool IsSigned,
19313 const X86Subtarget &Subtarget) {
19314 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19315 return true;
19316 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19317 return true;
19318 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19319 return true;
19320 if (Subtarget.useAVX512Regs()) {
19321 if (VT == MVT::v16i32)
19322 return true;
19323 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19324 return true;
19325 }
19326 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19327 (VT == MVT::v2i64 || VT == MVT::v4i64))
19328 return true;
19329 return false;
19330}
19331
19332SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19333 SelectionDAG &DAG) const {
19334 bool IsStrict = Op->isStrictFPOpcode();
19335 unsigned OpNo = IsStrict ? 1 : 0;
19336 SDValue Src = Op.getOperand(OpNo);
19337 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19338 MVT SrcVT = Src.getSimpleValueType();
19339 MVT VT = Op.getSimpleValueType();
19340 SDLoc dl(Op);
19341
19342 if (isSoftF16(VT, Subtarget))
19343 return promoteXINT_TO_FP(Op, dl, DAG);
19344 else if (isLegalConversion(SrcVT, true, Subtarget))
19345 return Op;
19346
19347 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19348 return LowerWin64_INT128_TO_FP(Op, DAG);
19349
19350 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19351 return Extract;
19352
19353 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19354 return R;
19355
19356 if (SrcVT.isVector()) {
19357 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19358 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19359 // source for strict FP.
19360 if (IsStrict)
19361 return DAG.getNode(
19362 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19363 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19364 DAG.getUNDEF(SrcVT))});
19365 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19366 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19367 DAG.getUNDEF(SrcVT)));
19368 }
19369 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19370 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19371
19372 return SDValue();
19373 }
19374
19375 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19376 "Unknown SINT_TO_FP to lower!");
19377
19378 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19379
19380 // These are really Legal; return the operand so the caller accepts it as
19381 // Legal.
19382 if (SrcVT == MVT::i32 && UseSSEReg)
19383 return Op;
19384 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19385 return Op;
19386
19387 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19388 return V;
19389 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19390 return V;
19391
19392 // SSE doesn't have an i16 conversion so we need to promote.
19393 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19394 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19395 if (IsStrict)
19396 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19397 {Chain, Ext});
19398
19399 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19400 }
19401
19402 if (VT == MVT::f128 || !Subtarget.hasX87())
19403 return SDValue();
19404
19405 SDValue ValueToStore = Src;
19406 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19407 // Bitcasting to f64 here allows us to do a single 64-bit store from
19408 // an SSE register, avoiding the store forwarding penalty that would come
19409 // with two 32-bit stores.
19410 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19411
19412 unsigned Size = SrcVT.getStoreSize();
19413 Align Alignment(Size);
19414 MachineFunction &MF = DAG.getMachineFunction();
19415 auto PtrVT = getPointerTy(MF.getDataLayout());
19416 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19417 MachinePointerInfo MPI =
19418 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19419 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19420 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19421 std::pair<SDValue, SDValue> Tmp =
19422 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19423
19424 if (IsStrict)
19425 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19426
19427 return Tmp.first;
19428}
19429
19430std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19431 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19432 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19433 // Build the FILD
19434 SDVTList Tys;
19435 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19436 if (useSSE)
19437 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19438 else
19439 Tys = DAG.getVTList(DstVT, MVT::Other);
19440
19441 SDValue FILDOps[] = {Chain, Pointer};
19442 SDValue Result =
19443 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19444 Alignment, MachineMemOperand::MOLoad);
19445 Chain = Result.getValue(1);
19446
19447 if (useSSE) {
19448 MachineFunction &MF = DAG.getMachineFunction();
19449 unsigned SSFISize = DstVT.getStoreSize();
19450 int SSFI =
19451 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19452 auto PtrVT = getPointerTy(MF.getDataLayout());
19453 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19454 Tys = DAG.getVTList(MVT::Other);
19455 SDValue FSTOps[] = {Chain, Result, StackSlot};
19456 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
19457 MachinePointerInfo::getFixedStack(MF, SSFI),
19458 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19459
19460 Chain =
19461 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19462 Result = DAG.getLoad(
19463 DstVT, DL, Chain, StackSlot,
19464 MachinePointerInfo::getFixedStack(MF, SSFI));
19465 Chain = Result.getValue(1);
19466 }
19467
19468 return { Result, Chain };
19469}
19470
19471/// Horizontal vector math instructions may be slower than normal math with
19472/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19473/// implementation, and likely shuffle complexity of the alternate sequence.
19474static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19475 const X86Subtarget &Subtarget) {
19476 bool IsOptimizingSize = DAG.shouldOptForSize();
19477 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19478 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19479}
19480
19481/// 64-bit unsigned integer to double expansion.
19482 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19483 SelectionDAG &DAG,
19484 const X86Subtarget &Subtarget) {
19485 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19486 // when converting 0 while rounding toward negative infinity. The caller will
19487 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
19488 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19489 // This algorithm is not obvious. Here it is what we're trying to output:
19490 /*
19491 movq %rax, %xmm0
19492 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19493 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19494 #ifdef __SSE3__
19495 haddpd %xmm0, %xmm0
19496 #else
19497 pshufd $0x4e, %xmm0, %xmm1
19498 addpd %xmm1, %xmm0
19499 #endif
19500 */
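// Why the magic constants work: after the unpack, the low double has the bit
// pattern 0x43300000:<lo32>, i.e. 2^52 + lo32, and the high double has
// 0x45300000:<hi32>, i.e. 2^84 + hi32 * 2^32. Subtracting (2^52, 2^84) leaves
// (lo32, hi32 * 2^32), and the horizontal add reassembles the full unsigned
// 64-bit value, rounded to double only in that last add.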
19501
19502 LLVMContext *Context = DAG.getContext();
19503
19504 // Build some magic constants.
19505 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19506 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19507 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19508 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19509
19510 SmallVector<Constant*,2> CV1;
19511 CV1.push_back(
19512 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19513 APInt(64, 0x4330000000000000ULL))));
19514 CV1.push_back(
19515 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19516 APInt(64, 0x4530000000000000ULL))));
19517 Constant *C1 = ConstantVector::get(CV1);
19518 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19519
19520 // Load the 64-bit value into an XMM register.
19521 SDValue XR1 =
19522 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19523 SDValue CLod0 = DAG.getLoad(
19524 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19525 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19526 SDValue Unpck1 =
19527 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19528
19529 SDValue CLod1 = DAG.getLoad(
19530 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19531 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19532 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19533 // TODO: Are there any fast-math-flags to propagate here?
19534 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19535 SDValue Result;
19536
19537 if (Subtarget.hasSSE3() &&
19538 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19539 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19540 } else {
19541 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19542 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19543 }
19544 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19545 DAG.getIntPtrConstant(0, dl));
19546 return Result;
19547}
19548
19549/// 32-bit unsigned integer to float expansion.
19550 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19551 SelectionDAG &DAG,
19552 const X86Subtarget &Subtarget) {
19553 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19554 // FP constant to bias correct the final result.
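// The bias is 2^52: OR-ing the 32-bit input x into the low mantissa bits of
// this double yields exactly 2^52 + x (x always fits in the 52-bit mantissa),
// so subtracting the bias afterwards recovers x as an exact double. E.g.
// x = 5 gives the bit pattern 0x4330000000000005 == 2^52 + 5.0.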
19555 SDValue Bias = DAG.getConstantFP(
19556 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19557
19558 // Load the 32-bit value into an XMM register.
19559 SDValue Load =
19560 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19561
19562 // Zero out the upper parts of the register.
19563 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19564
19565 // Or the load with the bias.
19566 SDValue Or = DAG.getNode(
19567 ISD::OR, dl, MVT::v2i64,
19568 DAG.getBitcast(MVT::v2i64, Load),
19569 DAG.getBitcast(MVT::v2i64,
19570 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19571 Or =
19572 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19573 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19574
19575 if (Op.getNode()->isStrictFPOpcode()) {
19576 // Subtract the bias.
19577 // TODO: Are there any fast-math-flags to propagate here?
19578 SDValue Chain = Op.getOperand(0);
19579 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19580 {Chain, Or, Bias});
19581
19582 if (Op.getValueType() == Sub.getValueType())
19583 return Sub;
19584
19585 // Handle final rounding.
19586 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19587 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19588
19589 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19590 }
19591
19592 // Subtract the bias.
19593 // TODO: Are there any fast-math-flags to propagate here?
19594 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19595
19596 // Handle final rounding.
19597 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19598}
19599
19600 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19601 SelectionDAG &DAG,
19602 const X86Subtarget &Subtarget) {
19603 if (Op.getSimpleValueType() != MVT::v2f64)
19604 return SDValue();
19605
19606 bool IsStrict = Op->isStrictFPOpcode();
19607
19608 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19609 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19610
19611 if (Subtarget.hasAVX512()) {
19612 if (!Subtarget.hasVLX()) {
19613 // Let generic type legalization widen this.
19614 if (!IsStrict)
19615 return SDValue();
19616 // Otherwise pad the integer input with 0s and widen the operation.
19617 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19618 DAG.getConstant(0, DL, MVT::v2i32));
19619 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19620 {Op.getOperand(0), N0});
19621 SDValue Chain = Res.getValue(1);
19622 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19623 DAG.getIntPtrConstant(0, DL));
19624 return DAG.getMergeValues({Res, Chain}, DL);
19625 }
19626
19627 // Legalize to v4i32 type.
19628 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19629 DAG.getUNDEF(MVT::v2i32));
19630 if (IsStrict)
19631 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19632 {Op.getOperand(0), N0});
19633 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19634 }
19635
19636 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19637 // This gives us the floating point equivalent of 2^52 + the i32 integer
19638 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19639 // point leaving just our i32 integers in double format.
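// Worked example for one lane: an input of 7 zero-extends to 0x0000000000000007;
// OR-ing in VBias's bit pattern gives 0x4330000000000007, which as a double is
// exactly 2^52 + 7, and the FSUB of VBias below leaves 7.0 in the lane.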
19640 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19641 SDValue VBias = DAG.getConstantFP(
19642 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19643 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19644 DAG.getBitcast(MVT::v2i64, VBias));
19645 Or = DAG.getBitcast(MVT::v2f64, Or);
19646
19647 if (IsStrict)
19648 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19649 {Op.getOperand(0), Or, VBias});
19650 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19651}
19652
19653 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19654 SelectionDAG &DAG,
19655 const X86Subtarget &Subtarget) {
19656 bool IsStrict = Op->isStrictFPOpcode();
19657 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19658 MVT VecIntVT = V.getSimpleValueType();
19659 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19660 "Unsupported custom type");
19661
19662 if (Subtarget.hasAVX512()) {
19663 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19664 assert(!Subtarget.hasVLX() && "Unexpected features");
19665 MVT VT = Op->getSimpleValueType(0);
19666
19667 // v8i32->v8f64 is legal with AVX512 so just return it.
19668 if (VT == MVT::v8f64)
19669 return Op;
19670
19671 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19672 "Unexpected VT!");
19673 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19674 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19675 // Need to concat with zero vector for strict fp to avoid spurious
19676 // exceptions.
19677 SDValue Tmp =
19678 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19679 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19680 DAG.getIntPtrConstant(0, DL));
19681 SDValue Res, Chain;
19682 if (IsStrict) {
19683 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19684 {Op->getOperand(0), V});
19685 Chain = Res.getValue(1);
19686 } else {
19687 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19688 }
19689
19690 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19691 DAG.getIntPtrConstant(0, DL));
19692
19693 if (IsStrict)
19694 return DAG.getMergeValues({Res, Chain}, DL);
19695 return Res;
19696 }
19697
19698 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19699 Op->getSimpleValueType(0) == MVT::v4f64) {
19700 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19701 Constant *Bias = ConstantFP::get(
19702 *DAG.getContext(),
19703 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19704 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19705 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19706 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19707 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19708 SDValue VBias = DAG.getMemIntrinsicNode(
19709 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19710 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
19711 MachineMemOperand::MOLoad);
19712
19713 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19714 DAG.getBitcast(MVT::v4i64, VBias));
19715 Or = DAG.getBitcast(MVT::v4f64, Or);
19716
19717 if (IsStrict)
19718 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19719 {Op.getOperand(0), Or, VBias});
19720 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19721 }
19722
19723 // The algorithm is the following:
19724 // #ifdef __SSE4_1__
19725 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19726 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19727 // (uint4) 0x53000000, 0xaa);
19728 // #else
19729 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19730 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19731 // #endif
19732 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19733 // return (float4) lo + fhi;
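// Why this works: as a float, lo reads as 2^23 + (v & 0xffff), since 0x4b000000
// is 2^23 and the low halfword lands in the mantissa. Likewise hi reads as
// 2^39 + (v >> 16) * 2^16, because one mantissa ULP at exponent 39 is 2^16.
// So fhi = hi - (2^39 + 2^23) = (v >> 16) * 2^16 - 2^23, and lo + fhi equals
// (v & 0xffff) + (v >> 16) * 2^16 = v, with rounding only in that final add.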
19734
19735 bool Is128 = VecIntVT == MVT::v4i32;
19736 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19737 // If we convert to something other than the supported type, e.g., to v4f64,
19738 // abort early.
19739 if (VecFloatVT != Op->getSimpleValueType(0))
19740 return SDValue();
19741
19742 // In the #ifdef/#else code, we have in common:
19743 // - The vector of constants:
19744 // -- 0x4b000000
19745 // -- 0x53000000
19746 // - A shift:
19747 // -- v >> 16
19748
19749 // Create the splat vector for 0x4b000000.
19750 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19751 // Create the splat vector for 0x53000000.
19752 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19753
19754 // Create the right shift.
19755 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19756 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19757
19758 SDValue Low, High;
19759 if (Subtarget.hasSSE41()) {
19760 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19761 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19762 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19763 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19764 // Low will be bitcasted right away, so do not bother bitcasting back to its
19765 // original type.
19766 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19767 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19768 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19769 // (uint4) 0x53000000, 0xaa);
19770 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19771 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19772 // High will be bitcasted right away, so do not bother bitcasting back to
19773 // its original type.
19774 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19775 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19776 } else {
19777 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19778 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19779 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19780 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19781
19782 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19783 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19784 }
19785
19786 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19787 SDValue VecCstFSub = DAG.getConstantFP(
19788 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19789
19790 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19791 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19792 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19793 // enabled. See PR24512.
19794 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19795 // TODO: Are there any fast-math-flags to propagate here?
19796 // (float4) lo;
19797 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19798 // return (float4) lo + fhi;
19799 if (IsStrict) {
19800 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19801 {Op.getOperand(0), HighBitcast, VecCstFSub});
19802 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19803 {FHigh.getValue(1), LowBitcast, FHigh});
19804 }
19805
19806 SDValue FHigh =
19807 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19808 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19809}
19810
19811 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19812 const X86Subtarget &Subtarget) {
19813 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19814 SDValue N0 = Op.getOperand(OpNo);
19815 MVT SrcVT = N0.getSimpleValueType();
19816
19817 switch (SrcVT.SimpleTy) {
19818 default:
19819 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19820 case MVT::v2i32:
19821 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19822 case MVT::v4i32:
19823 case MVT::v8i32:
19824 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19825 case MVT::v2i64:
19826 case MVT::v4i64:
19827 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19828 }
19829}
19830
19831SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19832 SelectionDAG &DAG) const {
19833 bool IsStrict = Op->isStrictFPOpcode();
19834 unsigned OpNo = IsStrict ? 1 : 0;
19835 SDValue Src = Op.getOperand(OpNo);
19836 SDLoc dl(Op);
19837 auto PtrVT = getPointerTy(DAG.getDataLayout());
19838 MVT SrcVT = Src.getSimpleValueType();
19839 MVT DstVT = Op->getSimpleValueType(0);
19840 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19841
19842 // Bail out when we don't have native conversion instructions.
19843 if (DstVT == MVT::f128)
19844 return SDValue();
19845
19846 if (isSoftF16(DstVT, Subtarget))
19847 return promoteXINT_TO_FP(Op, dl, DAG);
19848 else if (isLegalConversion(SrcVT, false, Subtarget))
19849 return Op;
19850
19851 if (DstVT.isVector())
19852 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19853
19854 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19855 return LowerWin64_INT128_TO_FP(Op, DAG);
19856
19857 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19858 return Extract;
19859
19860 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19861 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19862 // Conversions from unsigned i32 to f32/f64 are legal,
19863 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19864 return Op;
19865 }
19866
19867 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19868 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19869 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19870 if (IsStrict)
19871 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19872 {Chain, Src});
19873 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19874 }
19875
19876 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19877 return V;
19878 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19879 return V;
19880
19881 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19882 // infinity. It produces -0.0, so disable under strictfp.
19883 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19884 !IsStrict)
19885 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19886 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19887 // negative infinity. So disable under strictfp. Using FILD instead.
19888 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19889 !IsStrict)
19890 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19891 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19892 (DstVT == MVT::f32 || DstVT == MVT::f64))
19893 return SDValue();
19894
19895 // Make a 64-bit buffer, and use it to build an FILD.
19896 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19897 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19898 Align SlotAlign(8);
19899 MachinePointerInfo MPI =
19900 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19901 if (SrcVT == MVT::i32) {
19902 SDValue OffsetSlot =
19903 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19904 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19905 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19906 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19907 std::pair<SDValue, SDValue> Tmp =
19908 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19909 if (IsStrict)
19910 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19911
19912 return Tmp.first;
19913 }
19914
19915 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19916 SDValue ValueToStore = Src;
19917 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19918 // Bitcasting to f64 here allows us to do a single 64-bit store from
19919 // an SSE register, avoiding the store forwarding penalty that would come
19920 // with two 32-bit stores.
19921 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19922 }
19923 SDValue Store =
19924 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19925 // For i64 source, we need to add the appropriate power of 2 if the input
19926 // was negative. We must be careful to do the computation in x87 extended
19927 // precision, not in SSE.
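// The FILD below reinterprets the stored 64-bit pattern as a signed integer,
// so an unsigned input with the top bit set is read as (input - 2^64). The
// fudge factor selected below adds back exactly 2^64 (0x5F800000 as an f32,
// extended to f80), and that add is exact in the 64-bit x87 significand.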
19928 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19929 SDValue Ops[] = {Store, StackSlot};
19930 SDValue Fild =
19931 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19932 SlotAlign, MachineMemOperand::MOLoad);
19933 Chain = Fild.getValue(1);
19934
19935 // Check whether the sign bit is set.
19936 SDValue SignSet = DAG.getSetCC(
19937 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19938 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19939
19940 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19941 APInt FF(64, 0x5F80000000000000ULL);
19942 SDValue FudgePtr =
19943 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19944 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19945
19946 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19947 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19948 SDValue Four = DAG.getIntPtrConstant(4, dl);
19949 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19950 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19951
19952 // Load the value out, extending it from f32 to f80.
19953 SDValue Fudge = DAG.getExtLoad(
19954 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19955 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19956 CPAlignment);
19957 Chain = Fudge.getValue(1);
19958 // Extend everything to 80 bits to force it to be done on x87.
19959 // TODO: Are there any fast-math-flags to propagate here?
19960 if (IsStrict) {
19961 unsigned Opc = ISD::STRICT_FADD;
19962 // Windows needs the precision control changed to 80bits around this add.
19963 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19964 Opc = X86ISD::STRICT_FP80_ADD;
19965
19966 SDValue Add =
19967 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19968 // STRICT_FP_ROUND can't handle equal types.
19969 if (DstVT == MVT::f80)
19970 return Add;
19971 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19972 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19973 }
19974 unsigned Opc = ISD::FADD;
19975 // Windows needs the precision control changed to 80bits around this add.
19976 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19977 Opc = X86ISD::FP80_ADD;
19978
19979 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19980 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19981 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19982}
19983
19984// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19985// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19986// just return an SDValue().
19987// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19988// to i16, i32 or i64, and we lower it to a legal sequence and return the
19989// result.
19990SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19991 bool IsSigned,
19992 SDValue &Chain) const {
19993 bool IsStrict = Op->isStrictFPOpcode();
19994 SDLoc DL(Op);
19995
19996 EVT DstTy = Op.getValueType();
19997 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19998 EVT TheVT = Value.getValueType();
19999 auto PtrVT = getPointerTy(DAG.getDataLayout());
20000
20001 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20002 // f16 must be promoted before using the lowering in this routine.
20003 // fp128 does not use this lowering.
20004 return SDValue();
20005 }
20006
20007 // If using FIST to compute an unsigned i64, we'll need some fixup
20008 // to handle values above the maximum signed i64. A FIST is always
20009 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20010 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20011
20012 // FIXME: This does not generate an invalid exception if the input does not
20013 // fit in i32. PR44019
20014 if (!IsSigned && DstTy != MVT::i64) {
20015 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20016 // The low 32 bits of the fist result will have the correct uint32 result.
20017 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20018 DstTy = MVT::i64;
20019 }
20020
20021 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20022 DstTy.getSimpleVT() >= MVT::i16 &&
20023 "Unknown FP_TO_INT to lower!");
20024
20025 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20026 // stack slot.
20027 MachineFunction &MF = DAG.getMachineFunction();
20028 unsigned MemSize = DstTy.getStoreSize();
20029 int SSFI =
20030 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20031 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20032
20033 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20034
20035 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20036
20037 if (UnsignedFixup) {
20038 //
20039 // Conversion to unsigned i64 is implemented with a select,
20040 // depending on whether the source value fits in the range
20041 // of a signed i64. Let Thresh be the FP equivalent of
20042 // 0x8000000000000000ULL.
20043 //
20044 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20045 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20046 // FistSrc = (Value - FltOfs);
20047 // Fist-to-mem64 FistSrc
20048 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20049 // to XOR'ing the high 32 bits with Adjust.
20050 //
20051 // Being a power of 2, Thresh is exactly representable in all FP formats.
20052 // For X87 we'd like to use the smallest FP type for this constant, but
20053 // for DAG type consistency we have to match the FP operand type.
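// Worked example: for an f80 input equal to 2^63 + 3, Value >= Thresh, so
// FltOfs = 2^63 and FistSrc = 3.0. The FIST then stores 3, and XOR-ing the
// loaded result with Adjust (1 << 63) produces 0x8000000000000003, the
// intended unsigned value.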
20054
20055 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20056 APFloat::opStatus Status = APFloat::opOK;
20057 bool LosesInfo = false;
20058 if (TheVT == MVT::f64)
20059 // The rounding mode is irrelevant as the conversion should be exact.
20060 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20061 &LosesInfo);
20062 else if (TheVT == MVT::f80)
20063 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20064 APFloat::rmNearestTiesToEven, &LosesInfo);
20065
20066 assert(Status == APFloat::opOK && !LosesInfo &&
20067 "FP conversion should have been exact");
20068
20069 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20070
20071 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20072 *DAG.getContext(), TheVT);
20073 SDValue Cmp;
20074 if (IsStrict) {
20075 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20076 /*IsSignaling*/ true);
20077 Chain = Cmp.getValue(1);
20078 } else {
20079 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20080 }
20081
20082 // Our preferred lowering of
20083 //
20084 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20085 //
20086 // is
20087 //
20088 // (Value >= Thresh) << 63
20089 //
20090 // but since we can get here after LegalOperations, DAGCombine might do the
20091 // wrong thing if we create a select. So, directly create the preferred
20092 // version.
20093 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20094 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20095 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20096
20097 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20098 DAG.getConstantFP(0.0, DL, TheVT));
20099
20100 if (IsStrict) {
20101 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20102 { Chain, Value, FltOfs });
20103 Chain = Value.getValue(1);
20104 } else
20105 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20106 }
20107
20108 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20109
20110 // FIXME This causes a redundant load/store if the SSE-class value is already
20111 // in memory, such as if it is on the callstack.
20112 if (isScalarFPTypeInSSEReg(TheVT)) {
20113 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20114 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20115 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20116 SDValue Ops[] = { Chain, StackSlot };
20117
20118 unsigned FLDSize = TheVT.getStoreSize();
20119 assert(FLDSize <= MemSize && "Stack slot not big enough");
20120 MachineMemOperand *MMO = MF.getMachineMemOperand(
20121 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20122 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20123 Chain = Value.getValue(1);
20124 }
20125
20126 // Build the FP_TO_INT*_IN_MEM
20127 MachineMemOperand *MMO = MF.getMachineMemOperand(
20128 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20129 SDValue Ops[] = { Chain, Value, StackSlot };
20130 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20131 DAG.getVTList(MVT::Other),
20132 Ops, DstTy, MMO);
20133
20134 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20135 Chain = Res.getValue(1);
20136
20137 // If we need an unsigned fixup, XOR the result with adjust.
20138 if (UnsignedFixup)
20139 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20140
20141 return Res;
20142}
20143
20144 static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20145 const X86Subtarget &Subtarget) {
20146 MVT VT = Op.getSimpleValueType();
20147 SDValue In = Op.getOperand(0);
20148 MVT InVT = In.getSimpleValueType();
20149 unsigned Opc = Op.getOpcode();
20150
20151 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20152 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20153 "Unexpected extension opcode");
20155 "Expected same number of elements");
20156 assert((VT.getVectorElementType() == MVT::i16 ||
20157 VT.getVectorElementType() == MVT::i32 ||
20158 VT.getVectorElementType() == MVT::i64) &&
20159 "Unexpected element type");
20160 assert((InVT.getVectorElementType() == MVT::i8 ||
20161 InVT.getVectorElementType() == MVT::i16 ||
20162 InVT.getVectorElementType() == MVT::i32) &&
20163 "Unexpected element type");
20164
20165 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20166
20167 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20168 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20169 return splitVectorIntUnary(Op, DAG, dl);
20170 }
20171
20172 if (Subtarget.hasInt256())
20173 return Op;
20174
20175 // Optimize vectors in AVX mode:
20176 //
20177 // v8i16 -> v8i32
20178 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20179 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20180 // Concat upper and lower parts.
20181 //
20182 // v4i32 -> v4i64
20183 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20184 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20185 // Concat upper and lower parts.
20186 //
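// Illustratively, a zext of v8i16 %x to v8i32 under plain AVX then becomes
// roughly:
//   OpLo = zero_extend_vector_inreg %x   ; low 4 x i16 -> v4i32
//   OpHi = unpckh %x, zero               ; high 4 x i16 interleaved with zeros
//   concat_vectors OpLo, (bitcast v4i32 OpHi)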
20187 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20188 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20189
20190 // Short-circuit if we can determine that each 128-bit half is the same value.
20191 // Otherwise, this is difficult to match and optimize.
20192 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20193 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20194 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20195
20196 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20197 SDValue Undef = DAG.getUNDEF(InVT);
20198 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20199 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20200 OpHi = DAG.getBitcast(HalfVT, OpHi);
20201
20202 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20203}
20204
20205// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20206static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20207 const SDLoc &dl, SelectionDAG &DAG) {
20208 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20209 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20210 DAG.getIntPtrConstant(0, dl));
20211 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20212 DAG.getIntPtrConstant(8, dl));
20213 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20214 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20215 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20216 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20217}
20218
20219 static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20220 const X86Subtarget &Subtarget,
20221 SelectionDAG &DAG) {
20222 MVT VT = Op->getSimpleValueType(0);
20223 SDValue In = Op->getOperand(0);
20224 MVT InVT = In.getSimpleValueType();
20225 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20226 unsigned NumElts = VT.getVectorNumElements();
20227
20228 // For all vectors except vXi8 we can just emit a sign_extend and a shift. This
20229 // avoids a constant pool load.
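// E.g. (illustrative) for a zext of v8i1 to v8i32: the sign_extend yields
// 0 or 0xFFFFFFFF per element, and the logical shift right by 31 leaves the
// required 0 or 1.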
20230 if (VT.getVectorElementType() != MVT::i8) {
20231 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20232 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20233 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20234 }
20235
20236 // Extend VT if BWI is not supported.
20237 MVT ExtVT = VT;
20238 if (!Subtarget.hasBWI()) {
20239 // If v16i32 is to be avoided, we'll need to split and concatenate.
20240 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20241 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20242
20243 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20244 }
20245
20246 // Widen to 512-bits if VLX is not supported.
20247 MVT WideVT = ExtVT;
20248 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20249 NumElts *= 512 / ExtVT.getSizeInBits();
20250 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20251 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20252 In, DAG.getIntPtrConstant(0, DL));
20253 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20254 NumElts);
20255 }
20256
20257 SDValue One = DAG.getConstant(1, DL, WideVT);
20258 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20259
20260 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20261
20262 // Truncate if we had to extend above.
20263 if (VT != ExtVT) {
20264 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20265 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20266 }
20267
20268 // Extract back to 128/256-bit if we widened.
20269 if (WideVT != VT)
20270 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20271 DAG.getIntPtrConstant(0, DL));
20272
20273 return SelectedVal;
20274}
20275
20276 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20277 SelectionDAG &DAG) {
20278 SDValue In = Op.getOperand(0);
20279 MVT SVT = In.getSimpleValueType();
20280 SDLoc DL(Op);
20281
20282 if (SVT.getVectorElementType() == MVT::i1)
20283 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20284
20285 assert(Subtarget.hasAVX() && "Expected AVX support");
20286 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20287}
20288
20289/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20290/// It makes use of the fact that vectors with enough leading sign/zero bits
20291/// prevent the PACKSS/PACKUS from saturating the results.
20292/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20293/// within each 128-bit lane.
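/// For example (illustrative), a v8i32 -> v8i8 truncation proceeds in two
/// stages: v8i32 -> v8i16 via PACK*SDW on the two 128-bit halves, then
/// v8i16 -> v8i8 via PACK*SWB; this is only safe because the caller
/// guarantees enough leading sign/zero bits that no stage saturates.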
20294static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20295 const SDLoc &DL, SelectionDAG &DAG,
20296 const X86Subtarget &Subtarget) {
20297 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20298 "Unexpected PACK opcode");
20299 assert(DstVT.isVector() && "VT not a vector?");
20300
20301 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20302 if (!Subtarget.hasSSE2())
20303 return SDValue();
20304
20305 EVT SrcVT = In.getValueType();
20306
20307 // No truncation required, we might get here due to recursive calls.
20308 if (SrcVT == DstVT)
20309 return In;
20310
20311 unsigned NumElems = SrcVT.getVectorNumElements();
20312 if (NumElems < 2 || !isPowerOf2_32(NumElems) )
20313 return SDValue();
20314
20315 unsigned DstSizeInBits = DstVT.getSizeInBits();
20316 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20317 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20318 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20319
20320 LLVMContext &Ctx = *DAG.getContext();
20321 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20322 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20323
20324 // Pack to the largest type possible:
20325 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20326 EVT InVT = MVT::i16, OutVT = MVT::i8;
20327 if (SrcVT.getScalarSizeInBits() > 16 &&
20328 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20329 InVT = MVT::i32;
20330 OutVT = MVT::i16;
20331 }
20332
20333 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20334 // On pre-AVX512, pack the src in both halves to help value tracking.
20335 if (SrcSizeInBits <= 128) {
20336 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20337 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20338 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20339 SDValue LHS = DAG.getBitcast(InVT, In);
20340 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20341 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20342 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20343 Res = DAG.getBitcast(PackedVT, Res);
20344 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20345 }
20346
20347 // Split lower/upper subvectors.
20348 SDValue Lo, Hi;
20349 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20350
20351 // If Hi is undef, then don't bother packing it and widen the result instead.
20352 if (Hi.isUndef()) {
20353 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20354 if (SDValue Res =
20355 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20356 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20357 }
20358
20359 unsigned SubSizeInBits = SrcSizeInBits / 2;
20360 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20361 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20362
20363 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20364 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20365 Lo = DAG.getBitcast(InVT, Lo);
20366 Hi = DAG.getBitcast(InVT, Hi);
20367 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20368 return DAG.getBitcast(DstVT, Res);
20369 }
20370
20371 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20372 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20373 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20374 Lo = DAG.getBitcast(InVT, Lo);
20375 Hi = DAG.getBitcast(InVT, Hi);
20376 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20377
20378 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20379 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20380 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20381 SmallVector<int, 64> Mask;
20382 int Scale = 64 / OutVT.getScalarSizeInBits();
20383 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20384 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20385
20386 if (DstVT.is256BitVector())
20387 return DAG.getBitcast(DstVT, Res);
20388
20389 // If 512bit -> 128bit truncate another stage.
20390 Res = DAG.getBitcast(PackedVT, Res);
20391 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20392 }
20393
20394 // Recursively pack lower/upper subvectors, concat result and pack again.
20395 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20396
20397 if (PackedVT.is128BitVector()) {
20398 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20399 // type legalization.
20400 SDValue Res =
20401 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20402 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20403 }
20404
20405 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20406 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20407 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20408 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20409 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20410}
20411
20412/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20413/// e.g. trunc <8 x i32> X to <8 x i16> -->
20414/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20415/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20416 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20417 const X86Subtarget &Subtarget,
20418 SelectionDAG &DAG) {
20419 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20420 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20421}
20422
20423/// Truncate using inreg sign extension and X86ISD::PACKSS.
20424 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20425 const X86Subtarget &Subtarget,
20426 SelectionDAG &DAG) {
20427 EVT SrcVT = In.getValueType();
20428 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20429 DAG.getValueType(DstVT));
20430 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20431}
20432
20433/// Helper to determine if \p In truncated to \p DstVT has the necessary
20434/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20435/// possibly by converting a SRL node to SRA for sign extension.
20436static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20437 SDValue In, const SDLoc &DL,
20438 SelectionDAG &DAG,
20439 const X86Subtarget &Subtarget) {
20440 // Requires SSE2.
20441 if (!Subtarget.hasSSE2())
20442 return SDValue();
20443
20444 EVT SrcVT = In.getValueType();
20445 EVT DstSVT = DstVT.getVectorElementType();
20446 EVT SrcSVT = SrcVT.getVectorElementType();
20447 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20448 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20449
20450 // Check we have a truncation suited for PACKSS/PACKUS.
20451 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20452 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20453 return SDValue();
20454
20455 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20456 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20457
20458 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20459 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20460 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20461 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20462 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20463 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20464 return SDValue();
20465
20466 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20467 // split this for packing.
20468 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20469 !isFreeToSplitVector(In.getNode(), DAG) &&
20470 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20471 return SDValue();
20472
20473 // Don't truncate AVX512 targets with multiple stages of PACK nodes.
20474 if (Subtarget.hasAVX512() && NumStages > 1)
20475 return SDValue();
20476
20477 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20478 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20479
20480 // Truncate with PACKUS if we are truncating a vector with leading zero
20481 // bits that extend all the way to the packed/truncated value.
20482 // e.g. Masks, zext_in_reg, etc.
20483 // Pre-SSE41 we can only use PACKUSWB.
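// E.g. (illustrative) trunc (and X, 255) from v8i32 to v8i8: each element has
// at least 24 known leading zeros, and 32 - NumPackedZeroBits == 24, so the
// PACKUS path below applies.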
20484 KnownBits Known = DAG.computeKnownBits(In);
20485 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20486 PackOpcode = X86ISD::PACKUS;
20487 return In;
20488 }
20489
20490 // Truncate with PACKSS if we are truncating a vector with sign-bits
20491 // that extend all the way to the packed/truncated value.
20492 // e.g. Comparison result, sext_in_reg, etc.
20493 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20494
20495 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20496 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20497 // see through BITCASTs later on and combines/simplifications can't then use
20498 // it.
20499 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20500 !Subtarget.hasAVX512())
20501 return SDValue();
20502
20503 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20504 if (MinSignBits < NumSignBits) {
20505 PackOpcode = X86ISD::PACKSS;
20506 return In;
20507 }
20508
20509 // If we have a srl that only generates signbits that we will discard in
20510 // the truncation then we can use PACKSS by converting the srl to a sra.
20511 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
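// E.g. (illustrative) trunc (srl v8i32 X, 16) to v8i16: the shift amount
// equals MinSignBits (32 - 16) and the discarded bits are identical for srl
// and sra, so the node is rewritten as (sra X, 16) and packed with PACKSS.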
20512 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20513 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20514 if (*ShAmt == MinSignBits) {
20515 PackOpcode = X86ISD::PACKSS;
20516 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20517 }
20518 }
20519
20520 return SDValue();
20521}
20522
20523 /// This function lowers a vector truncation of 'extended sign-bits' or
20524 /// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
20525 /// into X86ISD::PACKSS/PACKUS operations.
20526 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20527 const SDLoc &DL,
20528 const X86Subtarget &Subtarget,
20529 SelectionDAG &DAG) {
20530 MVT SrcVT = In.getSimpleValueType();
20531 MVT DstSVT = DstVT.getVectorElementType();
20532 MVT SrcSVT = SrcVT.getVectorElementType();
20533 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20534 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20535 return SDValue();
20536
20537 // If the upper half of the source is undef, then attempt to split and
20538 // only truncate the lower half.
20539 if (DstVT.getSizeInBits() >= 128) {
20540 SmallVector<SDValue> LowerOps;
20541 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20542 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20543 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20544 Subtarget, DAG))
20545 return widenSubVector(Res, false, Subtarget, DAG, DL,
20546 DstVT.getSizeInBits());
20547 }
20548 }
20549
20550 unsigned PackOpcode;
20551 if (SDValue Src =
20552 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20553 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20554
20555 return SDValue();
20556}
20557
20558 /// This function lowers a vector truncation from vXi16/vXi32/vXi64 to
20559 /// vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACKSS operations.
20560 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20561 const X86Subtarget &Subtarget,
20562 SelectionDAG &DAG) {
20563 MVT SrcVT = In.getSimpleValueType();
20564 MVT DstSVT = DstVT.getVectorElementType();
20565 MVT SrcSVT = SrcVT.getVectorElementType();
20566 unsigned NumElems = DstVT.getVectorNumElements();
20567 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20568 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20569 NumElems >= 8))
20570 return SDValue();
20571
20572 // SSSE3's pshufb results in fewer instructions in the cases below.
20573 if (Subtarget.hasSSSE3() && NumElems == 8) {
20574 if (SrcSVT == MVT::i16)
20575 return SDValue();
20576 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20577 return SDValue();
20578 }
20579
20580 // If the upper half of the source is undef, then attempt to split and
20581 // only truncate the lower half.
20582 if (DstVT.getSizeInBits() >= 128) {
20583 SmallVector<SDValue> LowerOps;
20584 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20585 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20586 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20587 return widenSubVector(Res, false, Subtarget, DAG, DL,
20588 DstVT.getSizeInBits());
20589 }
20590 }
20591
20592 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20593 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20594 // truncate 2 x v4i32 to v8i16.
20595 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20596 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20597
20598 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20599 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20600
20601 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20602 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20603 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20604 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20605 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20606 }
20607
20608 return SDValue();
20609}
20610
20611 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20612 SelectionDAG &DAG,
20613 const X86Subtarget &Subtarget) {
20614 MVT VT = Op.getSimpleValueType();
20615 SDValue In = Op.getOperand(0);
20616 MVT InVT = In.getSimpleValueType();
20617 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20618
20619 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20620 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20621 if (InVT.getScalarSizeInBits() <= 16) {
20622 if (Subtarget.hasBWI()) {
20623 // legal, will go to VPMOVB2M, VPMOVW2M
20624 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20625 // We need to shift to get the lsb into sign position.
20627 // Shifting packed bytes is not supported natively, so bitcast to words.
20627 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20628 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20629 DAG.getBitcast(ExtVT, In),
20630 DAG.getConstant(ShiftInx, DL, ExtVT));
20631 In = DAG.getBitcast(InVT, In);
20632 }
20633 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20634 In, ISD::SETGT);
20635 }
20636 // Use TESTD/Q, extended vector to packed dword/qword.
20637 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20638 "Unexpected vector type.");
20639 unsigned NumElts = InVT.getVectorNumElements();
20640 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20641 // We need to change to a wider element type that we have support for.
20642 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20643 // For 16 element vectors we extend to v16i32 unless we are explicitly
20644 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20645 // we need to split into two 8 element vectors which we can extend to v8i32,
20646 // truncate and concat the results. There's an additional complication if
20647 // the original type is v16i8. In that case we can't split the v16i8
20648 // directly, so we need to shuffle high elements to low and use
20649 // sign_extend_vector_inreg.
20650 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20651 SDValue Lo, Hi;
20652 if (InVT == MVT::v16i8) {
20653 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20654 Hi = DAG.getVectorShuffle(
20655 InVT, DL, In, In,
20656 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20657 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20658 } else {
20659 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20660 Lo = extract128BitVector(In, 0, DAG, DL);
20661 Hi = extract128BitVector(In, 8, DAG, DL);
20662 }
20663 // We're split now, just emit two truncates and a concat. The two
20664 // truncates will trigger legalization to come back to this function.
20665 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20666 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20667 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20668 }
20669 // We either have 8 elements or we're allowed to use 512-bit vectors.
20670 // If we have VLX, we want to use the narrowest vector that can get the
20671 // job done so we use vXi32.
20672 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20673 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20674 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20675 InVT = ExtVT;
20676 ShiftInx = InVT.getScalarSizeInBits() - 1;
20677 }
20678
20679 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20680 // We need to shift to get the lsb into sign position.
20681 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20682 DAG.getConstant(ShiftInx, DL, InVT));
20683 }
20684 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20685 if (Subtarget.hasDQI())
20686 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20687 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20688}
20689
20690SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20691 SDLoc DL(Op);
20692 MVT VT = Op.getSimpleValueType();
20693 SDValue In = Op.getOperand(0);
20694 MVT InVT = In.getSimpleValueType();
20696 "Invalid TRUNCATE operation");
20697
20698 // If we're called by the type legalizer, handle a few cases.
20699 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20700 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20701 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20702 VT.is128BitVector() && Subtarget.hasAVX512()) {
20703 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20704 "Unexpected subtarget!");
20705 // The default behavior is to truncate one step, concatenate, and then
20706 // truncate the remainder. We'd rather produce two 64-bit results and
20707 // concatenate those.
20708 SDValue Lo, Hi;
20709 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20710
20711 EVT LoVT, HiVT;
20712 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20713
20714 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20715 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20716 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20717 }
20718
20719 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20720 if (!Subtarget.hasAVX512() ||
20721 (InVT.is512BitVector() && VT.is256BitVector()))
20722 if (SDValue SignPack =
20723 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20724 return SignPack;
20725
20726 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20727 if (!Subtarget.hasAVX512())
20728 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20729
20730 // Otherwise let default legalization handle it.
20731 return SDValue();
20732 }
20733
20734 if (VT.getVectorElementType() == MVT::i1)
20735 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
20736
20737 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20738 // concat from subvectors to use VPTRUNC etc.
20739 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20740 if (SDValue SignPack =
20741 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20742 return SignPack;
20743
20744 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20745 if (Subtarget.hasAVX512()) {
20746 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20747 assert(VT == MVT::v32i8 && "Unexpected VT!");
20748 return splitVectorIntUnary(Op, DAG, DL);
20749 }
20750
20751 // Word to byte is only legal under BWI. Otherwise we have to promote to
20752 // v16i32 and then truncate that. But we should only do that if we haven't
20753 // been asked to avoid 512-bit vectors. The actual promotion to v16i32 will
20754 // be handled by isel patterns.
20755 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20756 Subtarget.canExtendTo512DQ())
20757 return Op;
20758 }
20759
20760 // Handle truncation of V256 to V128 using shuffles.
20761 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20762
20763 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20764 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20765 if (Subtarget.hasInt256()) {
20766 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20767 In = DAG.getBitcast(MVT::v8i32, In);
20768 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20769 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20770 DAG.getIntPtrConstant(0, DL));
20771 }
20772
20773 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20774 DAG.getIntPtrConstant(0, DL));
20775 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20776 DAG.getIntPtrConstant(2, DL));
20777 static const int ShufMask[] = {0, 2, 4, 6};
20778 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20779 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20780 }
20781
20782 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20783 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20784 if (Subtarget.hasInt256()) {
20785 // The PSHUFB mask:
20786 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20787 -1, -1, -1, -1, -1, -1, -1, -1,
20788 16, 17, 20, 21, 24, 25, 28, 29,
20789 -1, -1, -1, -1, -1, -1, -1, -1 };
20790 In = DAG.getBitcast(MVT::v32i8, In);
20791 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20792 In = DAG.getBitcast(MVT::v4i64, In);
20793
20794 static const int ShufMask2[] = {0, 2, -1, -1};
20795 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20796 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20797 DAG.getIntPtrConstant(0, DL));
20798 return DAG.getBitcast(MVT::v8i16, In);
20799 }
20800
20801 return Subtarget.hasSSE41()
20802 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20803 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20804 }
20805
20806 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20807 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20808
20809 llvm_unreachable("All 256->128 cases should have been handled above!");
20810}
20811
20812// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20813// behaves on out of range inputs to generate optimized conversions.
20814 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20815 SelectionDAG &DAG,
20816 const X86Subtarget &Subtarget) {
20817 MVT SrcVT = Src.getSimpleValueType();
20818 unsigned DstBits = VT.getScalarSizeInBits();
20819 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20820
20821 // Calculate the converted result for values in the range 0 to
20822 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20823 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20824 SDValue Big =
20825 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20826 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20827 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20828
20829 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20830 // and only if the value was out of range. So we can use that
20831 // as our indicator that we should use "Big" instead of "Small".
20832 //
20833 // Use "Small" if "IsOverflown" has all bits cleared
20834 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20835
20836 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20837 // use the slightly slower blendv select instead.
20838 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20839 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20840 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20841 }
20842
20843 SDValue IsOverflown =
20844 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20845 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20846 return DAG.getNode(ISD::OR, dl, VT, Small,
20847 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20848}
20849
20850SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20851 bool IsStrict = Op->isStrictFPOpcode();
20852 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20853 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20854 MVT VT = Op->getSimpleValueType(0);
20855 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20856 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20857 MVT SrcVT = Src.getSimpleValueType();
20858 SDLoc dl(Op);
20859
20860 SDValue Res;
20861 if (isSoftF16(SrcVT, Subtarget)) {
20862 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20863 if (IsStrict)
20864 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20865 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20866 {NVT, MVT::Other}, {Chain, Src})});
20867 return DAG.getNode(Op.getOpcode(), dl, VT,
20868 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20869 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20870 return Op;
20871 }
20872
20873 if (VT.isVector()) {
20874 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20875 MVT ResVT = MVT::v4i32;
20876 MVT TruncVT = MVT::v4i1;
20877 unsigned Opc;
20878 if (IsStrict)
20879 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20880 else
20881 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20882
20883 if (!IsSigned && !Subtarget.hasVLX()) {
20884 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20885 // Widen to 512-bits.
20886 ResVT = MVT::v8i32;
20887 TruncVT = MVT::v8i1;
20888 Opc = Op.getOpcode();
20889 // Need to concat with zero vector for strict fp to avoid spurious
20890 // exceptions.
20891 // TODO: Should we just do this for non-strict as well?
20892 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20893 : DAG.getUNDEF(MVT::v8f64);
20894 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20895 DAG.getIntPtrConstant(0, dl));
20896 }
20897 if (IsStrict) {
20898 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20899 Chain = Res.getValue(1);
20900 } else {
20901 Res = DAG.getNode(Opc, dl, ResVT, Src);
20902 }
20903
20904 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20905 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20906 DAG.getIntPtrConstant(0, dl));
20907 if (IsStrict)
20908 return DAG.getMergeValues({Res, Chain}, dl);
20909 return Res;
20910 }
20911
20912 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20913 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20914 return Op;
20915
20916 MVT ResVT = VT;
20917 MVT EleVT = VT.getVectorElementType();
20918 if (EleVT != MVT::i64)
20919 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20920
20921 if (SrcVT != MVT::v8f16) {
20922 SDValue Tmp =
20923 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20924 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20925 Ops[0] = Src;
20926 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20927 }
20928
20929 if (IsStrict) {
20930 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20931 : X86ISD::STRICT_CVTTP2UI,
20932 dl, {ResVT, MVT::Other}, {Chain, Src});
20933 Chain = Res.getValue(1);
20934 } else {
20935 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20936 ResVT, Src);
20937 }
20938
20939 // TODO: Need to add exception check code for strict FP.
20940 if (EleVT.getSizeInBits() < 16) {
20941 ResVT = MVT::getVectorVT(EleVT, 8);
20942 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20943 }
20944
20945 if (ResVT != VT)
20946 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20947 DAG.getIntPtrConstant(0, dl));
20948
20949 if (IsStrict)
20950 return DAG.getMergeValues({Res, Chain}, dl);
20951 return Res;
20952 }
20953
20954 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20955 if (VT.getVectorElementType() == MVT::i16) {
20956 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20957 SrcVT.getVectorElementType() == MVT::f64) &&
20958 "Expected f32/f64 vector!");
20959 MVT NVT = VT.changeVectorElementType(MVT::i32);
20960 if (IsStrict) {
20961 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20962 : ISD::STRICT_FP_TO_UINT,
20963 dl, {NVT, MVT::Other}, {Chain, Src});
20964 Chain = Res.getValue(1);
20965 } else {
20966 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20967 NVT, Src);
20968 }
20969
20970 // TODO: Need to add exception check code for strict FP.
20971 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20972
20973 if (IsStrict)
20974 return DAG.getMergeValues({Res, Chain}, dl);
20975 return Res;
20976 }
20977
20978 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20979 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20980 assert(!IsSigned && "Expected unsigned conversion!");
20981 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20982 return Op;
20983 }
20984
20985 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20986 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20987 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20988 Subtarget.useAVX512Regs()) {
20989 assert(!IsSigned && "Expected unsigned conversion!");
20990 assert(!Subtarget.hasVLX() && "Unexpected features!");
20991 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20992 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20993 // Need to concat with zero vector for strict fp to avoid spurious
20994 // exceptions.
20995 // TODO: Should we just do this for non-strict as well?
20996 SDValue Tmp =
20997 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20998 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20999 DAG.getIntPtrConstant(0, dl));
21000
21001 if (IsStrict) {
21002 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21003 {Chain, Src});
21004 Chain = Res.getValue(1);
21005 } else {
21006 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21007 }
21008
21009 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21010 DAG.getIntPtrConstant(0, dl));
21011
21012 if (IsStrict)
21013 return DAG.getMergeValues({Res, Chain}, dl);
21014 return Res;
21015 }
21016
21017 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21018 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21019 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21020 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21021 assert(!Subtarget.hasVLX() && "Unexpected features!");
21022 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21023 // Need to concat with zero vector for strict fp to avoid spurious
21024 // exceptions.
21025 // TODO: Should we just do this for non-strict as well?
21026 SDValue Tmp =
21027 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21028 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21029 DAG.getIntPtrConstant(0, dl));
21030
21031 if (IsStrict) {
21032 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21033 {Chain, Src});
21034 Chain = Res.getValue(1);
21035 } else {
21036 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21037 }
21038
21039 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21040 DAG.getIntPtrConstant(0, dl));
21041
21042 if (IsStrict)
21043 return DAG.getMergeValues({Res, Chain}, dl);
21044 return Res;
21045 }
21046
21047 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21048 if (!Subtarget.hasVLX()) {
21049 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21050 // legalizer and then widened again by vector op legalization.
21051 if (!IsStrict)
21052 return SDValue();
21053
21054 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21055 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21056 {Src, Zero, Zero, Zero});
21057 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21058 {Chain, Tmp});
21059 SDValue Chain = Tmp.getValue(1);
21060 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21061 DAG.getIntPtrConstant(0, dl));
21062 return DAG.getMergeValues({Tmp, Chain}, dl);
21063 }
21064
21065 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21066 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21067 DAG.getUNDEF(MVT::v2f32));
21068 if (IsStrict) {
21069 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21070 : X86ISD::STRICT_CVTTP2UI;
21071 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21072 }
21073 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21074 return DAG.getNode(Opc, dl, VT, Tmp);
21075 }
21076
21077 // Generate optimized instructions for pre AVX512 unsigned conversions from
21078 // vXf32 to vXi32.
21079 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21080 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21081 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21082 assert(!IsSigned && "Expected unsigned conversion!");
21083 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21084 }
21085
21086 return SDValue();
21087 }
21088
21089 assert(!VT.isVector());
21090
21091 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21092
21093 if (!IsSigned && UseSSEReg) {
21094 // Conversions from f32/f64 with AVX512 should be legal.
21095 if (Subtarget.hasAVX512())
21096 return Op;
21097
21098 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21099 // behaves on out of range inputs to generate optimized conversions.
21100 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21101 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21102 unsigned DstBits = VT.getScalarSizeInBits();
21103 APInt UIntLimit = APInt::getSignMask(DstBits);
21104 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21105 DAG.getConstant(UIntLimit, dl, VT));
21106 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21107
21108 // Calculate the converted result for values in the range:
21109 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21110 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21111 SDValue Small =
21112 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21113 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21114 SDValue Big = DAG.getNode(
21115 X86ISD::CVTTS2SI, dl, VT,
21116 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21117 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21118
21119 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21120 // and only if the value was out of range. So we can use that
21121 // as our indicator that we should use "Big" instead of "Small".
21122 //
21123 // Use "Small" if "IsOverflown" has all bits cleared
21124 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21125 SDValue IsOverflown = DAG.getNode(
21126 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21127 return DAG.getNode(ISD::OR, dl, VT, Small,
21128 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21129 }
21130
21131 // Use default expansion for i64.
21132 if (VT == MVT::i64)
21133 return SDValue();
21134
21135 assert(VT == MVT::i32 && "Unexpected VT!");
21136
21137 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21138 // FIXME: This does not generate an invalid exception if the input does not
21139 // fit in i32. PR44019
21140 if (Subtarget.is64Bit()) {
21141 if (IsStrict) {
21142 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21143 {Chain, Src});
21144 Chain = Res.getValue(1);
21145 } else
21146 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21147
21148 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21149 if (IsStrict)
21150 return DAG.getMergeValues({Res, Chain}, dl);
21151 return Res;
21152 }
21153
21154 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21155 // use fisttp which will be handled later.
21156 if (!Subtarget.hasSSE3())
21157 return SDValue();
21158 }
21159
21160 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21161 // FIXME: This does not generate an invalid exception if the input does not
21162 // fit in i16. PR44019
21163 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21164 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21165 if (IsStrict) {
21166 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21167 {Chain, Src});
21168 Chain = Res.getValue(1);
21169 } else
21170 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21171
21172 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21173 if (IsStrict)
21174 return DAG.getMergeValues({Res, Chain}, dl);
21175 return Res;
21176 }
21177
21178 // If this is a FP_TO_SINT using SSEReg we're done.
21179 if (UseSSEReg && IsSigned)
21180 return Op;
21181
21182 // fp128 needs to use a libcall.
21183 if (SrcVT == MVT::f128) {
21184 RTLIB::Libcall LC;
21185 if (IsSigned)
21186 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21187 else
21188 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21189
21190 MakeLibCallOptions CallOptions;
21191 std::pair<SDValue, SDValue> Tmp =
21192 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21193
21194 if (IsStrict)
21195 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21196
21197 return Tmp.first;
21198 }
21199
21200 // Fall back to X87.
21201 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21202 if (IsStrict)
21203 return DAG.getMergeValues({V, Chain}, dl);
21204 return V;
21205 }
21206
21207 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21208}
21209
21210SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21211 SelectionDAG &DAG) const {
21212 SDValue Src = Op.getOperand(0);
21213 EVT DstVT = Op.getSimpleValueType();
21214 MVT SrcVT = Src.getSimpleValueType();
21215
21216 if (SrcVT.isVector())
21217 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21218
21219 if (SrcVT == MVT::f16)
21220 return SDValue();
21221
21222 // If the source is in an SSE register, the node is Legal.
21223 if (isScalarFPTypeInSSEReg(SrcVT))
21224 return Op;
21225
21226 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21227}
21228
21229SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21230 SelectionDAG &DAG) const {
21231 EVT DstVT = N->getValueType(0);
21232 SDValue Src = N->getOperand(0);
21233 EVT SrcVT = Src.getValueType();
21234
21235 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21236 // f16 must be promoted before using the lowering in this routine.
21237 // fp128 does not use this lowering.
21238 return SDValue();
21239 }
21240
21241 SDLoc DL(N);
21242 SDValue Chain = DAG.getEntryNode();
21243
21244 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21245
21246 // If we're converting from SSE, the stack slot needs to hold both types.
21247 // Otherwise it only needs to hold the DstVT.
21248 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21249 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21250 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21251 MachinePointerInfo MPI =
21252 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21253
21254 if (UseSSE) {
21255 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21256 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21257 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21258 SDValue Ops[] = { Chain, StackPtr };
21259
21260 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21261 /*Align*/ std::nullopt,
21262 MachineMemOperand::MOLoad);
21263 Chain = Src.getValue(1);
21264 }
21265
21266 SDValue StoreOps[] = { Chain, Src, StackPtr };
21267 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21268 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21269 MachineMemOperand::MOStore);
21270
21271 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21272}
21273
21274SDValue
21275X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21276 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21277 // but making use of X86 specifics to produce better instruction sequences.
21278 SDNode *Node = Op.getNode();
21279 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21280 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21281 SDLoc dl(SDValue(Node, 0));
21282 SDValue Src = Node->getOperand(0);
21283
21284 // There are three types involved here: SrcVT is the source floating point
21285 // type, DstVT is the type of the result, and TmpVT is the result of the
21286 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21287 // DstVT).
21288 EVT SrcVT = Src.getValueType();
21289 EVT DstVT = Node->getValueType(0);
21290 EVT TmpVT = DstVT;
21291
21292 // This code is only for floats and doubles. Fall back to generic code for
21293 // anything else.
21294 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21295 return SDValue();
21296
21297 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21298 unsigned SatWidth = SatVT.getScalarSizeInBits();
21299 unsigned DstWidth = DstVT.getScalarSizeInBits();
21300 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21301 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21302 "Expected saturation width smaller than result width");
21303
21304 // Promote result of FP_TO_*INT to at least 32 bits.
21305 if (TmpWidth < 32) {
21306 TmpVT = MVT::i32;
21307 TmpWidth = 32;
21308 }
21309
21310 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21311 // us to use a native signed conversion instead.
21312 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21313 TmpVT = MVT::i64;
21314 TmpWidth = 64;
21315 }
21316
21317 // If the saturation width is smaller than the size of the temporary result,
21318 // we can always use signed conversion, which is native.
21319 if (SatWidth < TmpWidth)
21320 FpToIntOpcode = ISD::FP_TO_SINT;
21321
21322 // Determine minimum and maximum integer values and their corresponding
21323 // floating-point values.
21324 APInt MinInt, MaxInt;
21325 if (IsSigned) {
21326 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21327 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21328 } else {
21329 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21330 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21331 }
21332
21333 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21334 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21335
21336 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21337 MinInt, IsSigned, APFloat::rmTowardZero);
21338 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21339 MaxInt, IsSigned, APFloat::rmTowardZero);
21340 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21341 && !(MaxStatus & APFloat::opStatus::opInexact);
21342
21343 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21344 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21345
21346 // If the integer bounds are exactly representable as floats, emit a
21347 // min+max+fptoi sequence. Otherwise use comparisons and selects.
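// E.g. (illustrative) llvm.fptosi.sat.i8.f32: the bounds -128.0 and 127.0 are
// exactly representable, so the input is clamped with FMAX/FMIN, converted
// with a native signed conversion to i32 and truncated to i8; a NaN input
// converts to 0x80000000, whose truncation yields the required 0.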
21348 if (AreExactFloatBounds) {
21349 if (DstVT != TmpVT) {
21350 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21351 SDValue MinClamped = DAG.getNode(
21352 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21353 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21354 SDValue BothClamped = DAG.getNode(
21355 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21356 // Convert clamped value to integer.
21357 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21358
21359 // NaN will become INDVAL, with the top bit set and the rest zero.
21360 // Truncation will discard the top bit, resulting in zero.
21361 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21362 }
21363
21364 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21365 SDValue MinClamped = DAG.getNode(
21366 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21367 // Clamp by MaxFloat from above. NaN cannot occur.
21368 SDValue BothClamped = DAG.getNode(
21369 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21370 // Convert clamped value to integer.
21371 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21372
21373 if (!IsSigned) {
21374 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21375 // which is zero.
21376 return FpToInt;
21377 }
21378
21379 // Otherwise, select zero if Src is NaN.
21380 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21381 return DAG.getSelectCC(
21382 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21383 }
21384
21385 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21386 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21387
21388 // Result of direct conversion, which may be selected away.
21389 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21390
21391 if (DstVT != TmpVT) {
21392 // NaN will become INDVAL, with the top bit set and the rest zero.
21393 // Truncation will discard the top bit, resulting in zero.
21394 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21395 }
21396
21397 SDValue Select = FpToInt;
21398 // For signed conversions where we saturate to the same size as the
21399 // result type of the fptoi instructions, INDVAL coincides with integer
21400 // minimum, so we don't need to explicitly check it.
21401 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21402 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21403 // MinInt if Src is NaN.
21404 Select = DAG.getSelectCC(
21405 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21406 }
21407
21408 // If Src OGT MaxFloat, select MaxInt.
21409 Select = DAG.getSelectCC(
21410 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21411
21412 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21413 // is already zero. The promoted case was already handled above.
21414 if (!IsSigned || DstVT != TmpVT) {
21415 return Select;
21416 }
21417
21418 // Otherwise, select 0 if Src is NaN.
21419 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21420 return DAG.getSelectCC(
21421 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21422}
21423
21424SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21425 bool IsStrict = Op->isStrictFPOpcode();
21426
21427 SDLoc DL(Op);
21428 MVT VT = Op.getSimpleValueType();
21429 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21430 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21431 MVT SVT = In.getSimpleValueType();
21432
21433 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21434 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21435 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21436 !Subtarget.getTargetTriple().isOSDarwin()))
21437 return SDValue();
21438
21439 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21440 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21441 return Op;
21442
21443 if (SVT == MVT::f16) {
21444 if (Subtarget.hasFP16())
21445 return Op;
21446
21447 if (VT != MVT::f32) {
21448 if (IsStrict)
21449 return DAG.getNode(
21450 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21451 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21452 {MVT::f32, MVT::Other}, {Chain, In})});
21453
21454 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21455 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21456 }
21457
21458 if (!Subtarget.hasF16C()) {
21459 if (!Subtarget.getTargetTriple().isOSDarwin())
21460 return SDValue();
21461
21462 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21463
21464 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21465 TargetLowering::CallLoweringInfo CLI(DAG);
21466 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21467
21468 In = DAG.getBitcast(MVT::i16, In);
21469 TargetLowering::ArgListTy Args;
21470 TargetLowering::ArgListEntry Entry;
21471 Entry.Node = In;
21472 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21473 Entry.IsSExt = false;
21474 Entry.IsZExt = true;
21475 Args.push_back(Entry);
21476
21477 SDValue Callee = DAG.getExternalSymbol(
21478 getLibcallName(RTLIB::FPEXT_F16_F32),
21479 getPointerTy(DAG.getDataLayout()));
21480 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21481 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21482 std::move(Args));
21483
21484 SDValue Res;
21485 std::tie(Res,Chain) = LowerCallTo(CLI);
21486 if (IsStrict)
21487 Res = DAG.getMergeValues({Res, Chain}, DL);
21488
21489 return Res;
21490 }
21491
21492 In = DAG.getBitcast(MVT::i16, In);
21493 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21494 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21495 DAG.getIntPtrConstant(0, DL));
21496 SDValue Res;
21497 if (IsStrict) {
21498 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21499 {Chain, In});
21500 Chain = Res.getValue(1);
21501 } else {
21502 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21503 DAG.getTargetConstant(4, DL, MVT::i32));
21504 }
21505 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21506 DAG.getIntPtrConstant(0, DL));
21507 if (IsStrict)
21508 return DAG.getMergeValues({Res, Chain}, DL);
21509 return Res;
21510 }
21511
21512 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21513 return Op;
21514
21515 if (SVT.getVectorElementType() == MVT::f16) {
21516 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21517 return Op;
21518 assert(Subtarget.hasF16C() && "Unexpected features!");
21519 if (SVT == MVT::v2f16)
21520 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21521 DAG.getUNDEF(MVT::v2f16));
21522 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21523 DAG.getUNDEF(MVT::v4f16));
21524 if (IsStrict)
21525 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21526 {Op->getOperand(0), Res});
21527 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21528 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21529 return Op;
21530 }
21531
21532 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21533
21534 SDValue Res =
21535 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21536 if (IsStrict)
21537 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21538 {Op->getOperand(0), Res});
21539 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21540}
21541
21542SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21543 bool IsStrict = Op->isStrictFPOpcode();
21544
21545 SDLoc DL(Op);
21546 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21547 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21548 MVT VT = Op.getSimpleValueType();
21549 MVT SVT = In.getSimpleValueType();
21550
21551 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21552 return SDValue();
21553
21554 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21555 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21556 if (!Subtarget.getTargetTriple().isOSDarwin())
21557 return SDValue();
21558
21559 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21560    TargetLowering::CallLoweringInfo CLI(DAG);
21561    Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21562
21563    TargetLowering::ArgListTy Args;
21564    TargetLowering::ArgListEntry Entry;
21565    Entry.Node = In;
21566 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21567 Entry.IsSExt = false;
21568 Entry.IsZExt = true;
21569 Args.push_back(Entry);
21570
21571    SDValue Callee = DAG.getExternalSymbol(
21572        getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21573                                       : RTLIB::FPROUND_F32_F16),
21574        getPointerTy(DAG.getDataLayout()));
21575 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21576 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21577 std::move(Args));
21578
21579 SDValue Res;
21580 std::tie(Res, Chain) = LowerCallTo(CLI);
21581
21582 Res = DAG.getBitcast(MVT::f16, Res);
21583
21584 if (IsStrict)
21585 Res = DAG.getMergeValues({Res, Chain}, DL);
21586
21587 return Res;
21588 }
21589
21590 if (VT.getScalarType() == MVT::bf16) {
21591 if (SVT.getScalarType() == MVT::f32 &&
21592 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21593 Subtarget.hasAVXNECONVERT()))
21594 return Op;
21595 return SDValue();
21596 }
21597
21598 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21599 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21600 return SDValue();
21601
21602 if (VT.isVector())
21603 return Op;
21604
21605 SDValue Res;
21606    SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21607                                        MVT::i32);
21608 if (IsStrict) {
21609 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21610 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21611 DAG.getIntPtrConstant(0, DL));
21612 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21613 {Chain, Res, Rnd});
21614 Chain = Res.getValue(1);
21615 } else {
21616 // FIXME: Should we use zeros for upper elements for non-strict?
21617 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21618 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21619 }
21620
21621 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21622 DAG.getIntPtrConstant(0, DL));
21623 Res = DAG.getBitcast(MVT::f16, Res);
21624
21625 if (IsStrict)
21626 return DAG.getMergeValues({Res, Chain}, DL);
21627
21628 return Res;
21629 }
21630
21631 return Op;
21632}
21633
21634static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21635  bool IsStrict = Op->isStrictFPOpcode();
21636 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21637 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21638 "Unexpected VT!");
21639
21640 SDLoc dl(Op);
21641 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21642 DAG.getConstant(0, dl, MVT::v8i16), Src,
21643 DAG.getIntPtrConstant(0, dl));
21644
21645 SDValue Chain;
21646 if (IsStrict) {
21647 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21648 {Op.getOperand(0), Res});
21649 Chain = Res.getValue(1);
21650 } else {
21651 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21652 }
21653
21654 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21655 DAG.getIntPtrConstant(0, dl));
21656
21657 if (IsStrict)
21658 return DAG.getMergeValues({Res, Chain}, dl);
21659
21660 return Res;
21661}
21662
21663static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21664  bool IsStrict = Op->isStrictFPOpcode();
21665 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21666 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21667 "Unexpected VT!");
21668
21669 SDLoc dl(Op);
21670 SDValue Res, Chain;
21671 if (IsStrict) {
21672 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21673 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21674 DAG.getIntPtrConstant(0, dl));
21675 Res = DAG.getNode(
21676 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21677 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21678 Chain = Res.getValue(1);
21679 } else {
21680 // FIXME: Should we use zeros for upper elements for non-strict?
21681 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21682 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21683 DAG.getTargetConstant(4, dl, MVT::i32));
21684 }
21685
21686 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21687 DAG.getIntPtrConstant(0, dl));
21688
21689 if (IsStrict)
21690 return DAG.getMergeValues({Res, Chain}, dl);
21691
21692 return Res;
21693}
21694
21695SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21696 SelectionDAG &DAG) const {
21697 SDLoc DL(Op);
21698
21699 MVT SVT = Op.getOperand(0).getSimpleValueType();
21700 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21701 Subtarget.hasAVXNECONVERT())) {
21702 SDValue Res;
21703 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21704 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21705 Res = DAG.getBitcast(MVT::v8i16, Res);
21706 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21707 DAG.getIntPtrConstant(0, DL));
21708 }
21709
21710 MakeLibCallOptions CallOptions;
21711 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21712 SDValue Res =
21713 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21714 return DAG.getBitcast(MVT::i16, Res);
21715}
21716
21717/// Depending on uarch and/or optimizing for size, we might prefer to use a
21718/// vector operation in place of the typical scalar operation.
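/// For example, with SSE3 the scalar op
///   (f32 (fadd (extractelt V, 0), (extractelt V, 1)))
/// can be lowered to (extractelt (X86ISD::FHADD V, V), 0), replacing the
/// extract/add sequence with a single horizontal add (see the worked
/// patterns in the body below).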
21719static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21720                                         SelectionDAG &DAG,
21721 const X86Subtarget &Subtarget) {
21722 // If both operands have other uses, this is probably not profitable.
21723 SDValue LHS = Op.getOperand(0);
21724 SDValue RHS = Op.getOperand(1);
21725 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21726 return Op;
21727
21728 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21729 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21730 if (IsFP && !Subtarget.hasSSE3())
21731 return Op;
21732 if (!IsFP && !Subtarget.hasSSSE3())
21733 return Op;
21734
21735 // Extract from a common vector.
21736 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21737 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21738 LHS.getOperand(0) != RHS.getOperand(0) ||
21739 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21740 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21741 !shouldUseHorizontalOp(true, DAG, Subtarget))
21742 return Op;
21743
21744 // Allow commuted 'hadd' ops.
21745 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21746 unsigned HOpcode;
21747 switch (Op.getOpcode()) {
21748 // clang-format off
21749 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21750 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21751 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21752 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21753 default:
21754 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21755 // clang-format on
21756 }
21757 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21758 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21759 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21760 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21761 std::swap(LExtIndex, RExtIndex);
21762
21763 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21764 return Op;
21765
21766 SDValue X = LHS.getOperand(0);
21767 EVT VecVT = X.getValueType();
21768 unsigned BitWidth = VecVT.getSizeInBits();
21769 unsigned NumLanes = BitWidth / 128;
21770 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21771 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21772 "Not expecting illegal vector widths here");
21773
21774 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21775 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21776 if (BitWidth == 256 || BitWidth == 512) {
21777 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21778 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21779 LExtIndex %= NumEltsPerLane;
21780 }
21781
21782 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21783 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21784 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21785 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21786 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21787 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21788 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21789}
21790
21791/// Depending on uarch and/or optimizing for size, we might prefer to use a
21792/// vector operation in place of the typical scalar operation.
21793SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21794 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21795 "Only expecting float/double");
21796 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21797}
21798
21799/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21800/// This mode isn't supported in hardware on X86. But as long as we aren't
21801/// compiling with trapping math, we can emulate this with
21802/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
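/// Illustrative f32 example: nextafter(0.5, 0.0) == 0x1.fffffep-2f, so for
/// X = 2.5f the sum 2.5f + 0.49999997f rounds to 3.0f and truncates to 3.0f
/// (ties away from zero), while X = 2.4f gives 2.8999999f, which truncates
/// to 2.0f.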
21803static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21804  SDValue N0 = Op.getOperand(0);
21805 SDLoc dl(Op);
21806 MVT VT = Op.getSimpleValueType();
21807
21808 // N0 += copysign(nextafter(0.5, 0.0), N0)
21809  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21810  bool Ignored;
21811 APFloat Point5Pred = APFloat(0.5f);
21812 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21813 Point5Pred.next(/*nextDown*/true);
21814
21815 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21816 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21817 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21818
21819 // Truncate the result to remove fraction.
21820 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21821}
21822
21823/// The only differences between FABS and FNEG are the mask and the logic op.
21824/// FNEG also has a folding opportunity for FNEG(FABS(x)).
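/// For f32 both reduce to a bitwise op against a splatted constant:
///   FABS(x)  = x & 0x7fffffff   (X86ISD::FAND)
///   FNEG(x)  = x ^ 0x80000000   (X86ISD::FXOR)
///   FNABS(x) = x | 0x80000000   (X86ISD::FOR, from the FNEG(FABS(x)) fold)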
21825static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21826  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21827 "Wrong opcode for lowering FABS or FNEG.");
21828
21829 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21830
21831 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21832 // into an FNABS. We'll lower the FABS after that if it is still in use.
21833 if (IsFABS)
21834 for (SDNode *User : Op->uses())
21835 if (User->getOpcode() == ISD::FNEG)
21836 return Op;
21837
21838 SDLoc dl(Op);
21839 MVT VT = Op.getSimpleValueType();
21840
21841 bool IsF128 = (VT == MVT::f128);
21842 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21843         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21844         "Unexpected type in LowerFABSorFNEG");
21845
21846 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21847 // decide if we should generate a 16-byte constant mask when we only need 4 or
21848 // 8 bytes for the scalar case.
21849
21850 // There are no scalar bitwise logical SSE/AVX instructions, so we
21851 // generate a 16-byte vector constant and logic op even for the scalar case.
21852 // Using a 16-byte mask allows folding the load of the mask with
21853 // the logic op, so it can save (~4 bytes) on code size.
21854 bool IsFakeVector = !VT.isVector() && !IsF128;
21855 MVT LogicVT = VT;
21856 if (IsFakeVector)
21857 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21858 : (VT == MVT::f32) ? MVT::v4f32
21859 : MVT::v8f16;
21860
21861 unsigned EltBits = VT.getScalarSizeInBits();
21862 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21863 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21864 APInt::getSignMask(EltBits);
21865  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21866  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21867
21868 SDValue Op0 = Op.getOperand(0);
21869 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21870 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21871 IsFNABS ? X86ISD::FOR :
21872                              X86ISD::FXOR;
21873  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21874
21875 if (VT.isVector() || IsF128)
21876 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21877
21878 // For the scalar case extend to a 128-bit vector, perform the logic op,
21879 // and extract the scalar result back out.
21880 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21881 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21883 DAG.getIntPtrConstant(0, dl));
21884}
21885
21886static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21887  SDValue Mag = Op.getOperand(0);
21888 SDValue Sign = Op.getOperand(1);
21889 SDLoc dl(Op);
21890
21891 // If the sign operand is smaller, extend it first.
21892 MVT VT = Op.getSimpleValueType();
21893 if (Sign.getSimpleValueType().bitsLT(VT))
21894 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21895
21896 // And if it is bigger, shrink it first.
21897 if (Sign.getSimpleValueType().bitsGT(VT))
21898 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21899 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21900
21901 // At this point the operands and the result should have the same
21902 // type, and that won't be f80 since that is not custom lowered.
21903 bool IsF128 = (VT == MVT::f128);
21904 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21905         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21906         "Unexpected type in LowerFCOPYSIGN");
21907
21908  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21909
21910 // Perform all scalar logic operations as 16-byte vectors because there are no
21911 // scalar FP logic instructions in SSE.
21912 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21913 // unnecessary splats, but we might miss load folding opportunities. Should
21914 // this decision be based on OptimizeForSize?
21915 bool IsFakeVector = !VT.isVector() && !IsF128;
21916 MVT LogicVT = VT;
21917 if (IsFakeVector)
21918 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21919 : (VT == MVT::f32) ? MVT::v4f32
21920 : MVT::v8f16;
21921
21922 // The mask constants are automatically splatted for vector types.
21923 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21924 SDValue SignMask = DAG.getConstantFP(
21925 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21926 SDValue MagMask = DAG.getConstantFP(
21927 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21928
21929 // First, clear all bits but the sign bit from the second operand (sign).
21930 if (IsFakeVector)
21931 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21932 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21933
21934 // Next, clear the sign bit from the first operand (magnitude).
21935 // TODO: If we had general constant folding for FP logic ops, this check
21936 // wouldn't be necessary.
21937 SDValue MagBits;
21938 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21939 APFloat APF = Op0CN->getValueAPF();
21940 APF.clearSign();
21941 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21942 } else {
21943 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21944 if (IsFakeVector)
21945 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21946 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21947 }
21948
21949 // OR the magnitude value with the sign bit.
21950 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21951 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21952 DAG.getIntPtrConstant(0, dl));
21953}
21954
21955static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21956  SDValue N0 = Op.getOperand(0);
21957 SDLoc dl(Op);
21958 MVT VT = Op.getSimpleValueType();
21959
21960 MVT OpVT = N0.getSimpleValueType();
21961 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21962 "Unexpected type for FGETSIGN");
21963
21964 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21965 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21966 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21967 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21968 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21969 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21970 return Res;
21971}
21972
21973/// Helper for attempting to create a X86ISD::BT node.
21974static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21975 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21976 // instruction. Since the shift amount is in-range-or-undefined, we know
21977 // that doing a bittest on the i32 value is ok. We extend to i32 because
21978 // the encoding for the i16 version is larger than the i32 version.
21979 // Also promote i16 to i32 for performance / code size reason.
21980 if (Src.getValueType().getScalarSizeInBits() < 32)
21981 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21982
21983 // No legal type found, give up.
21984 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21985 return SDValue();
21986
21987 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21988 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21989 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21990 // known to be zero.
21991 if (Src.getValueType() == MVT::i64 &&
21992 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21993 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21994
21995 // If the operand types disagree, extend the shift amount to match. Since
21996 // BT ignores high bits (like shifts) we can use anyextend.
21997 if (Src.getValueType() != BitNo.getValueType()) {
21998 // Peek through a mask/modulo operation.
21999 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22000 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22001 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22002 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22003 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22004 BitNo.getOperand(0)),
22005 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22006 BitNo.getOperand(1)));
22007 else
22008 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22009 }
22010
22011 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22012}
22013
22014/// Helper for creating a X86ISD::SETCC node.
22015static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22016                        SelectionDAG &DAG) {
22017 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22018 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22019}
22020
22021/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22022/// recognizable memcmp expansion.
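/// e.g. (or (or (xor a, b), (xor c, d)), (xor e, f)) is accepted, while a
/// lone xor at the root is rejected so this only fires for genuine
/// multi-chunk compares.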
22023static bool isOrXorXorTree(SDValue X, bool Root = true) {
22024 if (X.getOpcode() == ISD::OR)
22025 return isOrXorXorTree(X.getOperand(0), false) &&
22026 isOrXorXorTree(X.getOperand(1), false);
22027 if (Root)
22028 return false;
22029 return X.getOpcode() == ISD::XOR;
22030}
22031
22032/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22033/// expansion.
22034template <typename F>
22035static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22036                                EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22037 SDValue Op0 = X.getOperand(0);
22038 SDValue Op1 = X.getOperand(1);
22039 if (X.getOpcode() == ISD::OR) {
22040 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22041 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22042 if (VecVT != CmpVT)
22043 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22044 if (HasPT)
22045 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22046 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22047 }
22048 if (X.getOpcode() == ISD::XOR) {
22049 SDValue A = SToV(Op0);
22050 SDValue B = SToV(Op1);
22051 if (VecVT != CmpVT)
22052 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22053 if (HasPT)
22054 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22055 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22056 }
22057 llvm_unreachable("Impossible");
22058}
22059
22060/// Try to map a 128-bit or larger integer comparison to vector instructions
22061/// before type legalization splits it up into chunks.
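/// e.g. (setcc i128 X, Y, eq) can become PCMPEQB + PMOVMSKB compared against
/// 0xFFFF on SSE2, or a single PTEST of (X ^ Y) on SSE4.1 and later.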
22062static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22063                                               ISD::CondCode CC,
22064                                               const SDLoc &DL,
22065 SelectionDAG &DAG,
22066 const X86Subtarget &Subtarget) {
22067 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22068
22069 // We're looking for an oversized integer equality comparison.
22070 EVT OpVT = X.getValueType();
22071 unsigned OpSize = OpVT.getSizeInBits();
22072 if (!OpVT.isScalarInteger() || OpSize < 128)
22073 return SDValue();
22074
22075 // Ignore a comparison with zero because that gets special treatment in
22076 // EmitTest(). But make an exception for the special case of a pair of
22077 // logically-combined vector-sized operands compared to zero. This pattern may
22078 // be generated by the memcmp expansion pass with oversized integer compares
22079 // (see PR33325).
22080 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22081 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22082 return SDValue();
22083
22084 // Don't perform this combine if constructing the vector will be expensive.
22085 auto IsVectorBitCastCheap = [](SDValue X) {
22086    X = peekThroughBitcasts(X);
22087    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22088 X.getOpcode() == ISD::LOAD;
22089 };
22090 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22091 !IsOrXorXorTreeCCZero)
22092 return SDValue();
22093
22094 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22095 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22096 // Otherwise use PCMPEQ (plus AND) and mask testing.
22097 bool NoImplicitFloatOps =
22098      DAG.getMachineFunction().getFunction().hasFnAttribute(
22099          Attribute::NoImplicitFloat);
22100 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22101 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22102 (OpSize == 256 && Subtarget.hasAVX()) ||
22103 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22104 bool HasPT = Subtarget.hasSSE41();
22105
22106 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22107 // vector registers are essentially free. (Technically, widening registers
22108 // prevents load folding, but the tradeoff is worth it.)
22109 bool PreferKOT = Subtarget.preferMaskRegisters();
22110 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22111
22112 EVT VecVT = MVT::v16i8;
22113 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22114 if (OpSize == 256) {
22115 VecVT = MVT::v32i8;
22116 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22117 }
22118 EVT CastVT = VecVT;
22119 bool NeedsAVX512FCast = false;
22120 if (OpSize == 512 || NeedZExt) {
22121 if (Subtarget.hasBWI()) {
22122 VecVT = MVT::v64i8;
22123 CmpVT = MVT::v64i1;
22124 if (OpSize == 512)
22125 CastVT = VecVT;
22126 } else {
22127 VecVT = MVT::v16i32;
22128 CmpVT = MVT::v16i1;
22129 CastVT = OpSize == 512 ? VecVT
22130 : OpSize == 256 ? MVT::v8i32
22131 : MVT::v4i32;
22132 NeedsAVX512FCast = true;
22133 }
22134 }
22135
22136 auto ScalarToVector = [&](SDValue X) -> SDValue {
22137 bool TmpZext = false;
22138 EVT TmpCastVT = CastVT;
22139 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22140 SDValue OrigX = X.getOperand(0);
22141 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22142 if (OrigSize < OpSize) {
22143 if (OrigSize == 128) {
22144 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22145 X = OrigX;
22146 TmpZext = true;
22147 } else if (OrigSize == 256) {
22148 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22149 X = OrigX;
22150 TmpZext = true;
22151 }
22152 }
22153 }
22154 X = DAG.getBitcast(TmpCastVT, X);
22155 if (!NeedZExt && !TmpZext)
22156 return X;
22157 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22158 DAG.getConstant(0, DL, VecVT), X,
22159 DAG.getVectorIdxConstant(0, DL));
22160 };
22161
22162 SDValue Cmp;
22163 if (IsOrXorXorTreeCCZero) {
22164 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22165 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22166 // Use 2 vector equality compares and 'and' the results before doing a
22167 // MOVMSK.
22168 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22169 } else {
22170 SDValue VecX = ScalarToVector(X);
22171 SDValue VecY = ScalarToVector(Y);
22172 if (VecVT != CmpVT) {
22173 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22174 } else if (HasPT) {
22175 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22176 } else {
22177 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22178 }
22179 }
22180 // AVX512 should emit a setcc that will lower to kortest.
22181 if (VecVT != CmpVT) {
22182 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22183 : CmpVT == MVT::v32i1 ? MVT::i32
22184 : MVT::i16;
22185 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22186 DAG.getConstant(0, DL, KRegVT), CC);
22187 }
22188 if (HasPT) {
22189 SDValue BCCmp =
22190 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22191 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22192      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22193      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22194 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22195 }
22196 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22197 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22198 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22199 assert(Cmp.getValueType() == MVT::v16i8 &&
22200 "Non 128-bit vector on pre-SSE41 target");
22201 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22202 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22203 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22204 }
22205
22206 return SDValue();
22207}
22208
22209/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22210/// style scalarized (associative) reduction patterns. Partial reductions
22211/// are supported when the pointer SrcMask is non-null.
22212/// TODO - move this to SelectionDAG?
22213static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22214                                 SmallVectorImpl<SDValue> &SrcOps,
22215                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22216  SmallVector<SDValue, 8> Opnds;
22217  DenseMap<SDValue, APInt> SrcOpMap;
22218 EVT VT = MVT::Other;
22219
22220 // Recognize a special case where a vector is casted into wide integer to
22221 // test all 0s.
22222 assert(Op.getOpcode() == unsigned(BinOp) &&
22223 "Unexpected bit reduction opcode");
22224 Opnds.push_back(Op.getOperand(0));
22225 Opnds.push_back(Op.getOperand(1));
22226
22227 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22228    SDValue I = Opnds[Slot];
22229    // BFS traverse all BinOp operands.
22230 if (I->getOpcode() == unsigned(BinOp)) {
22231 Opnds.push_back(I->getOperand(0));
22232 Opnds.push_back(I->getOperand(1));
22233 // Re-evaluate the number of nodes to be traversed.
22234 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22235 continue;
22236 }
22237
22238    // Quit if we hit a non-EXTRACT_VECTOR_ELT operand.
22239 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22240 return false;
22241
22242    // Quit if the index is not a constant.
22243 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22244 if (!Idx)
22245 return false;
22246
22247 SDValue Src = I->getOperand(0);
22248 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22249 if (M == SrcOpMap.end()) {
22250 VT = Src.getValueType();
22251 // Quit if not the same type.
22252 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22253 return false;
22254 unsigned NumElts = VT.getVectorNumElements();
22255 APInt EltCount = APInt::getZero(NumElts);
22256 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22257 SrcOps.push_back(Src);
22258 }
22259
22260 // Quit if element already used.
22261 unsigned CIdx = Idx->getZExtValue();
22262 if (M->second[CIdx])
22263 return false;
22264 M->second.setBit(CIdx);
22265 }
22266
22267 if (SrcMask) {
22268 // Collect the source partial masks.
22269 for (SDValue &SrcOp : SrcOps)
22270 SrcMask->push_back(SrcOpMap[SrcOp]);
22271 } else {
22272 // Quit if not all elements are used.
22273 for (const auto &I : SrcOpMap)
22274 if (!I.second.isAllOnes())
22275 return false;
22276 }
22277
22278 return true;
22279}
22280
22281// Helper function for comparing all bits of two vectors.
22282static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22283                                   ISD::CondCode CC, const APInt &OriginalMask,
22284 const X86Subtarget &Subtarget,
22285 SelectionDAG &DAG, X86::CondCode &X86CC) {
22286 EVT VT = LHS.getValueType();
22287 unsigned ScalarSize = VT.getScalarSizeInBits();
22288 if (OriginalMask.getBitWidth() != ScalarSize) {
22289 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22290 return SDValue();
22291 }
22292
22293  // Quit if not convertible to a legal scalar or 128/256-bit vector.
22294 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22295 return SDValue();
22296
22297 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22298 if (VT.isFloatingPoint())
22299 return SDValue();
22300
22301 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22302 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22303
22304 APInt Mask = OriginalMask;
22305
22306 auto MaskBits = [&](SDValue Src) {
22307 if (Mask.isAllOnes())
22308 return Src;
22309 EVT SrcVT = Src.getValueType();
22310 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22311 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22312 };
22313
22314 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22315 if (VT.getSizeInBits() < 128) {
22316 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22317 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22318 if (IntVT != MVT::i64)
22319 return SDValue();
22320 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22321 MVT::i32, MVT::i32);
22322 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22323 MVT::i32, MVT::i32);
22324 SDValue Lo =
22325 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22326 SDValue Hi =
22327 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22328 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22329 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22330 DAG.getConstant(0, DL, MVT::i32));
22331 }
22332 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22333 DAG.getBitcast(IntVT, MaskBits(LHS)),
22334 DAG.getBitcast(IntVT, MaskBits(RHS)));
22335 }
22336
22337 // Without PTEST, a masked v2i64 or-reduction is not faster than
22338 // scalarization.
22339 bool UseKORTEST = Subtarget.useAVX512Regs();
22340 bool UsePTEST = Subtarget.hasSSE41();
22341 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22342 return SDValue();
22343
22344 // Split down to 128/256/512-bit vector.
22345 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22346
22347 // If the input vector has vector elements wider than the target test size,
22348 // then cast to <X x i64> so it will safely split.
22349 if (ScalarSize > TestSize) {
22350 if (!Mask.isAllOnes())
22351 return SDValue();
22352 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22353 LHS = DAG.getBitcast(VT, LHS);
22354 RHS = DAG.getBitcast(VT, RHS);
22355 Mask = APInt::getAllOnes(64);
22356 }
22357
22358 if (VT.getSizeInBits() > TestSize) {
22359 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22360 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22361 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22362 while (VT.getSizeInBits() > TestSize) {
22363 auto Split = DAG.SplitVector(LHS, DL);
22364 VT = Split.first.getValueType();
22365 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22366 }
22367 RHS = DAG.getAllOnesConstant(DL, VT);
22368 } else if (!UsePTEST && !KnownRHS.isZero()) {
22369 // MOVMSK Special Case:
22370 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22371 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22372 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22373 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22374 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22375 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22376 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22377 V = DAG.getSExtOrTrunc(V, DL, VT);
22378 while (VT.getSizeInBits() > TestSize) {
22379 auto Split = DAG.SplitVector(V, DL);
22380 VT = Split.first.getValueType();
22381 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22382 }
22383 V = DAG.getNOT(DL, V, VT);
22384 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22385 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22386 DAG.getConstant(0, DL, MVT::i32));
22387 } else {
22388 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22389 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22390 while (VT.getSizeInBits() > TestSize) {
22391 auto Split = DAG.SplitVector(V, DL);
22392 VT = Split.first.getValueType();
22393 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22394 }
22395 LHS = V;
22396 RHS = DAG.getConstant(0, DL, VT);
22397 }
22398 }
22399
22400 if (UseKORTEST && VT.is512BitVector()) {
22401 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22402 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22403 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22404 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22405 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22406 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22407 }
22408
22409 if (UsePTEST) {
22410 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22411 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22412 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22413 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22414 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22415 }
22416
22417 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22418 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22419 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22420 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22421 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22422 V = DAG.getNOT(DL, V, MaskVT);
22423 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22424 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22425 DAG.getConstant(0, DL, MVT::i32));
22426}
22427
22428// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22429// to CMP(MOVMSK(PCMPEQB(X,Y))).
22430static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22431                                       ISD::CondCode CC, const SDLoc &DL,
22432 const X86Subtarget &Subtarget,
22433 SelectionDAG &DAG,
22434 X86::CondCode &X86CC) {
22435 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22436
22437 bool CmpNull = isNullConstant(RHS);
22438 bool CmpAllOnes = isAllOnesConstant(RHS);
22439 if (!CmpNull && !CmpAllOnes)
22440 return SDValue();
22441
22442 SDValue Op = LHS;
22443 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22444 return SDValue();
22445
22446 // Check whether we're masking/truncating an OR-reduction result, in which
22447 // case track the masked bits.
22448 // TODO: Add CmpAllOnes support.
22449 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22450 if (CmpNull) {
22451 switch (Op.getOpcode()) {
22452 case ISD::TRUNCATE: {
22453 SDValue Src = Op.getOperand(0);
22454 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22455 Op.getScalarValueSizeInBits());
22456 Op = Src;
22457 break;
22458 }
22459 case ISD::AND: {
22460 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22461 Mask = Cst->getAPIntValue();
22462 Op = Op.getOperand(0);
22463 }
22464 break;
22465 }
22466 }
22467 }
22468
22469 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22470
22471 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22472 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22473  SmallVector<SDValue, 8> VecIns;
22474  if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22475 EVT VT = VecIns[0].getValueType();
22476 assert(llvm::all_of(VecIns,
22477 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22478 "Reduction source vector mismatch");
22479
22480 // Quit if not splittable to scalar/128/256/512-bit vector.
22481 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22482 return SDValue();
22483
22484 // If more than one full vector is evaluated, AND/OR them first before
22485 // PTEST.
22486 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22487 Slot += 2, e += 1) {
22488 // Each iteration will AND/OR 2 nodes and append the result until there is
22489 // only 1 node left, i.e. the final value of all vectors.
22490 SDValue LHS = VecIns[Slot];
22491 SDValue RHS = VecIns[Slot + 1];
22492 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22493 }
22494
22495 return LowerVectorAllEqual(DL, VecIns.back(),
22496 CmpNull ? DAG.getConstant(0, DL, VT)
22497 : DAG.getAllOnesConstant(DL, VT),
22498 CC, Mask, Subtarget, DAG, X86CC);
22499 }
22500
22501 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22502 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22503 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22504 ISD::NodeType BinOp;
22505 if (SDValue Match =
22506 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22507 EVT MatchVT = Match.getValueType();
22508      return LowerVectorAllEqual(DL, Match,
22509                                 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22510 : DAG.getAllOnesConstant(DL, MatchVT),
22511 CC, Mask, Subtarget, DAG, X86CC);
22512 }
22513 }
22514
22515 if (Mask.isAllOnes()) {
22516 assert(!Op.getValueType().isVector() &&
22517 "Illegal vector type for reduction pattern");
22518    SDValue Src = peekThroughBitcasts(Op);
22519    if (Src.getValueType().isFixedLengthVector() &&
22520 Src.getValueType().getScalarType() == MVT::i1) {
22521 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22522 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22523 if (Src.getOpcode() == ISD::SETCC) {
22524 SDValue LHS = Src.getOperand(0);
22525 SDValue RHS = Src.getOperand(1);
22526 EVT LHSVT = LHS.getValueType();
22527 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22528 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22529 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22530 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22531 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22532 X86CC);
22533 }
22534 }
22535 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22536 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22537 // Peek through truncation, mask the LSB and compare against zero/LSB.
22538 if (Src.getOpcode() == ISD::TRUNCATE) {
22539 SDValue Inner = Src.getOperand(0);
22540 EVT InnerVT = Inner.getValueType();
22541 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22542 unsigned BW = InnerVT.getScalarSizeInBits();
22543 APInt SrcMask = APInt(BW, 1);
22544 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22545 return LowerVectorAllEqual(DL, Inner,
22546 DAG.getConstant(Cmp, DL, InnerVT), CC,
22547 SrcMask, Subtarget, DAG, X86CC);
22548 }
22549 }
22550 }
22551 }
22552
22553 return SDValue();
22554}
22555
22556/// return true if \c Op has a use that doesn't just read flags.
22557static bool hasNonFlagsUse(SDValue Op) {
22558  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22559 ++UI) {
22560 SDNode *User = *UI;
22561 unsigned UOpNo = UI.getOperandNo();
22562 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22563      // Look past the truncate.
22564 UOpNo = User->use_begin().getOperandNo();
22565 User = *User->use_begin();
22566 }
22567
22568 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22569 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22570 return true;
22571 }
22572 return false;
22573}
22574
22575// Transform to an x86-specific ALU node with flags if there is a chance of
22576// using an RMW op or only the flags are used. Otherwise, leave
22577// the node alone and emit a 'cmp' or 'test' instruction.
22578static bool isProfitableToUseFlagOp(SDValue Op) {
22579  for (SDNode *U : Op->uses())
22580 if (U->getOpcode() != ISD::CopyToReg &&
22581 U->getOpcode() != ISD::SETCC &&
22582 U->getOpcode() != ISD::STORE)
22583 return false;
22584
22585 return true;
22586}
22587
22588/// Emit nodes that will be selected as "test Op0,Op0", or something
22589/// equivalent.
22590static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22591 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22592 // CF and OF aren't always set the way we want. Determine which
22593 // of these we need.
22594 bool NeedCF = false;
22595 bool NeedOF = false;
22596 switch (X86CC) {
22597 default: break;
22598 case X86::COND_A: case X86::COND_AE:
22599 case X86::COND_B: case X86::COND_BE:
22600 NeedCF = true;
22601 break;
22602 case X86::COND_G: case X86::COND_GE:
22603 case X86::COND_L: case X86::COND_LE:
22604 case X86::COND_O: case X86::COND_NO: {
22605 // Check if we really need to set the
22606 // Overflow flag. If NoSignedWrap is present
22607 // that is not actually needed.
22608 switch (Op->getOpcode()) {
22609 case ISD::ADD:
22610 case ISD::SUB:
22611 case ISD::MUL:
22612 case ISD::SHL:
22613 if (Op.getNode()->getFlags().hasNoSignedWrap())
22614 break;
22615 [[fallthrough]];
22616 default:
22617 NeedOF = true;
22618 break;
22619 }
22620 break;
22621 }
22622 }
22623 // See if we can use the EFLAGS value from the operand instead of
22624 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22625 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22626 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22627 // Emit a CMP with 0, which is the TEST pattern.
22628 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22629 DAG.getConstant(0, dl, Op.getValueType()));
22630 }
22631 unsigned Opcode = 0;
22632 unsigned NumOperands = 0;
22633
22634 SDValue ArithOp = Op;
22635
22636 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22637 // which may be the result of a CAST. We use the variable 'Op', which is the
22638 // non-casted variable when we check for possible users.
22639 switch (ArithOp.getOpcode()) {
22640 case ISD::AND:
22641 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22642 // because a TEST instruction will be better.
22643 if (!hasNonFlagsUse(Op))
22644 break;
22645
22646 [[fallthrough]];
22647 case ISD::ADD:
22648 case ISD::SUB:
22649 case ISD::OR:
22650 case ISD::XOR:
22651    if (!isProfitableToUseFlagOp(Op))
22652      break;
22653
22654 // Otherwise use a regular EFLAGS-setting instruction.
22655 switch (ArithOp.getOpcode()) {
22656 // clang-format off
22657 default: llvm_unreachable("unexpected operator!");
22658 case ISD::ADD: Opcode = X86ISD::ADD; break;
22659 case ISD::SUB: Opcode = X86ISD::SUB; break;
22660 case ISD::XOR: Opcode = X86ISD::XOR; break;
22661 case ISD::AND: Opcode = X86ISD::AND; break;
22662 case ISD::OR: Opcode = X86ISD::OR; break;
22663 // clang-format on
22664 }
22665
22666 NumOperands = 2;
22667 break;
22668 case X86ISD::ADD:
22669 case X86ISD::SUB:
22670 case X86ISD::OR:
22671 case X86ISD::XOR:
22672 case X86ISD::AND:
22673 return SDValue(Op.getNode(), 1);
22674 case ISD::SSUBO:
22675 case ISD::USUBO: {
22676    // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22677 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22678 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22679 Op->getOperand(1)).getValue(1);
22680 }
22681 default:
22682 break;
22683 }
22684
22685 if (Opcode == 0) {
22686 // Emit a CMP with 0, which is the TEST pattern.
22687 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22688 DAG.getConstant(0, dl, Op.getValueType()));
22689 }
22690 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22691 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22692
22693 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22694 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22695 return SDValue(New.getNode(), 1);
22696}
22697
22698/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22699/// equivalent.
22700static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22701 const SDLoc &dl, SelectionDAG &DAG,
22702 const X86Subtarget &Subtarget) {
22703 if (isNullConstant(Op1))
22704 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22705
22706 EVT CmpVT = Op0.getValueType();
22707
22708 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22709 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22710
22711 // Only promote the compare up to I32 if it is a 16 bit operation
22712 // with an immediate. 16 bit immediates are to be avoided unless the target
22713 // isn't slowed down by length changing prefixes, we're optimizing for
22714 // codesize or the comparison is with a folded load.
22715 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
22716 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
22717      !DAG.getMachineFunction().getFunction().hasMinSize()) {
22718    auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
22719 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
22720 // Don't do this if the immediate can fit in 8-bits.
22721 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22722 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22723 unsigned ExtendOp =
22724          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22725      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22726 // For equality comparisons try to use SIGN_EXTEND if the input was
22727 // truncate from something with enough sign bits.
22728 if (Op0.getOpcode() == ISD::TRUNCATE) {
22729 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22730 ExtendOp = ISD::SIGN_EXTEND;
22731 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22732 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22733 ExtendOp = ISD::SIGN_EXTEND;
22734 }
22735 }
22736
22737 CmpVT = MVT::i32;
22738 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22739 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22740 }
22741 }
22742
22743 // Try to shrink i64 compares if the input has enough zero bits.
22744 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
22745 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
22746 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22747 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
22748 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22749 CmpVT = MVT::i32;
22750 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22751 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22752 }
22753
22754 // 0-x == y --> x+y == 0
22755 // 0-x != y --> x+y != 0
22756 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22757 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22758 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22759 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22760 return Add.getValue(1);
22761 }
22762
22763 // x == 0-y --> x+y == 0
22764 // x != 0-y --> x+y != 0
22765 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22766 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22767 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22768 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22769 return Add.getValue(1);
22770 }
22771
22772 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22773 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22774 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22775 return Sub.getValue(1);
22776}
22777
22778bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22779                                                          EVT VT) const {
22780 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22781}
22782
22783bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22784 SDNode *N, SDValue, SDValue IntPow2) const {
22785 if (N->getOpcode() == ISD::FDIV)
22786 return true;
22787
22788 EVT FPVT = N->getValueType(0);
22789 EVT IntVT = IntPow2.getValueType();
22790
22791 // This indicates a non-free bitcast.
22792 // TODO: This is probably overly conservative as we will need to scale the
22793 // integer vector anyways for the int->fp cast.
22794 if (FPVT.isVector() &&
22795 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22796 return false;
22797
22798 return true;
22799}
22800
22801/// Check if replacement of SQRT with RSQRT should be disabled.
22802bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22803 EVT VT = Op.getValueType();
22804
22805 // We don't need to replace SQRT with RSQRT for half type.
22806 if (VT.getScalarType() == MVT::f16)
22807 return true;
22808
22809 // We never want to use both SQRT and RSQRT instructions for the same input.
22810 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22811 return false;
22812
22813 if (VT.isVector())
22814 return Subtarget.hasFastVectorFSQRT();
22815 return Subtarget.hasFastScalarFSQRT();
22816}
22817
22818/// The minimum architected relative accuracy is 2^-12. We need one
22819/// Newton-Raphson step to have a good float result (24 bits of precision).
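/// (The refinement itself is emitted by the caller, not here; for rsqrt one
/// Newton-Raphson step is E1 = E0 * (1.5 - 0.5 * X * E0 * E0).)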
22820SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22821 SelectionDAG &DAG, int Enabled,
22822 int &RefinementSteps,
22823 bool &UseOneConstNR,
22824 bool Reciprocal) const {
22825 SDLoc DL(Op);
22826 EVT VT = Op.getValueType();
22827
22828 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22829 // It is likely not profitable to do this for f64 because a double-precision
22830 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22831 // instructions: convert to single, rsqrtss, convert back to double, refine
22832 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22833 // along with FMA, this could be a throughput win.
22834 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22835 // after legalize types.
22836 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22837 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22838 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22839 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22840 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22841 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22842 RefinementSteps = 1;
22843
22844 UseOneConstNR = false;
22845 // There is no FSQRT for 512-bits, but there is RSQRT14.
22846 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22847 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22848 if (RefinementSteps == 0 && !Reciprocal)
22849 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22850 return Estimate;
22851 }
22852
22853 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22854 Subtarget.hasFP16()) {
22855 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22856 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22857 RefinementSteps = 0;
22858
22859 if (VT == MVT::f16) {
22860 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22861 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22862 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22863 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22864 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22865 }
22866
22867 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22868 }
22869 return SDValue();
22870}
22871
22872/// The minimum architected relative accuracy is 2^-12. We need one
22873/// Newton-Raphson step to have a good float result (24 bits of precision).
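/// (The refinement itself is emitted by the caller, not here; for the
/// reciprocal one Newton-Raphson step is E1 = E0 * (2.0 - X * E0).)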
22874SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22875 int Enabled,
22876 int &RefinementSteps) const {
22877 SDLoc DL(Op);
22878 EVT VT = Op.getValueType();
22879
22880 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22881 // It is likely not profitable to do this for f64 because a double-precision
22882 // reciprocal estimate with refinement on x86 prior to FMA requires
22883 // 15 instructions: convert to single, rcpss, convert back to double, refine
22884 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22885 // along with FMA, this could be a throughput win.
22886
22887 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22888 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22889 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22890 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22891 // Enable estimate codegen with 1 refinement step for vector division.
22892 // Scalar division estimates are disabled because they break too much
22893 // real-world code. These defaults are intended to match GCC behavior.
22894 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22895 return SDValue();
22896
22897 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22898 RefinementSteps = 1;
22899
22900 // There is no FSQRT for 512-bits, but there is RCP14.
22901 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22902 return DAG.getNode(Opcode, DL, VT, Op);
22903 }
22904
22905 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22906 Subtarget.hasFP16()) {
22907 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22908 RefinementSteps = 0;
22909
22910 if (VT == MVT::f16) {
22911 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22912 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22913 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22914 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22915 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22916 }
22917
22918 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22919 }
22920 return SDValue();
22921}
22922
22923/// If we have at least two divisions that use the same divisor, convert to
22924/// multiplication by a reciprocal. This may need to be adjusted for a given
22925/// CPU if a division's cost is not at least twice the cost of a multiplication.
22926/// This is because we still need one division to calculate the reciprocal and
22927/// then we need two multiplies by that reciprocal as replacements for the
22928/// original divisions.
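/// e.g. given a/d and b/d, emit r = 1.0/d once and rewrite them as a*r and
/// b*r: one divide plus two multiplies instead of two divides.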
22929unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22930 return 2;
22931}
22932
22933SDValue
22934X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22935 SelectionDAG &DAG,
22936 SmallVectorImpl<SDNode *> &Created) const {
22937  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22938  if (isIntDivCheap(N->getValueType(0), Attr))
22939 return SDValue(N,0); // Lower SDIV as SDIV
22940
22941 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22942 "Unexpected divisor!");
22943
22944 // Only perform this transform if CMOV is supported otherwise the select
22945 // below will become a branch.
22946 if (!Subtarget.canUseCMOV())
22947 return SDValue();
22948
22949 // fold (sdiv X, pow2)
22950 EVT VT = N->getValueType(0);
22951 // FIXME: Support i8.
22952 if (VT != MVT::i16 && VT != MVT::i32 &&
22953 !(Subtarget.is64Bit() && VT == MVT::i64))
22954 return SDValue();
22955
22956 // If the divisor is 2 or -2, the default expansion is better.
22957 if (Divisor == 2 ||
22958 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22959 return SDValue();
22960
22961 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22962}
22963
22964/// Result of 'and' is compared against zero. Change to a BT node if possible.
22965/// Returns the BT node and the condition code needed to use it.
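/// e.g. both (and X, (shl 1, N)) and (and (srl X, N), 1) tested against zero
/// become (X86ISD::BT X, N), using COND_AE for SETEQ and COND_B for SETNE.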
22966static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22967                            SelectionDAG &DAG, X86::CondCode &X86CC) {
22968 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22969 SDValue Op0 = And.getOperand(0);
22970 SDValue Op1 = And.getOperand(1);
22971 if (Op0.getOpcode() == ISD::TRUNCATE)
22972 Op0 = Op0.getOperand(0);
22973 if (Op1.getOpcode() == ISD::TRUNCATE)
22974 Op1 = Op1.getOperand(0);
22975
22976 SDValue Src, BitNo;
22977 if (Op1.getOpcode() == ISD::SHL)
22978 std::swap(Op0, Op1);
22979 if (Op0.getOpcode() == ISD::SHL) {
22980 if (isOneConstant(Op0.getOperand(0))) {
22981 // If we looked past a truncate, check that it's only truncating away
22982 // known zeros.
22983 unsigned BitWidth = Op0.getValueSizeInBits();
22984 unsigned AndBitWidth = And.getValueSizeInBits();
22985 if (BitWidth > AndBitWidth) {
22986 KnownBits Known = DAG.computeKnownBits(Op0);
22987 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22988 return SDValue();
22989 }
22990 Src = Op1;
22991 BitNo = Op0.getOperand(1);
22992 }
22993 } else if (Op1.getOpcode() == ISD::Constant) {
22994 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22995 uint64_t AndRHSVal = AndRHS->getZExtValue();
22996 SDValue AndLHS = Op0;
22997
22998 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22999 Src = AndLHS.getOperand(0);
23000 BitNo = AndLHS.getOperand(1);
23001 } else {
23002 // Use BT if the immediate can't be encoded in a TEST instruction or we
23003      // are optimizing for size and the immediate won't fit in a byte.
23004 bool OptForSize = DAG.shouldOptForSize();
23005 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23006 isPowerOf2_64(AndRHSVal)) {
23007 Src = AndLHS;
23008 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23009 Src.getValueType());
23010 }
23011 }
23012 }
23013
23014 // No patterns found, give up.
23015 if (!Src.getNode())
23016 return SDValue();
23017
23018 // Remove any bit flip.
23019 if (isBitwiseNot(Src)) {
23020 Src = Src.getOperand(0);
23021 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23022 }
23023
23024 // Attempt to create the X86ISD::BT node.
23025 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23026 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23027 return BT;
23028 }
23029
23030 return SDValue();
23031}
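// Editor's illustrative sketch (not part of this file): the scalar fact the BT
// transform relies on - testing '(X & (1 << N)) != 0' is just reading bit N,
// which BT places in the carry flag (COND_B / COND_AE pick the sense). The
// helper name is hypothetical.
[[maybe_unused]] static constexpr bool testBitSketch(uint64_t X, unsigned N) {
  return ((X >> N) & 1u) != 0; // value of CF after 'bt X, N'
}
static_assert(testBitSketch(0b1010, 1) && !testBitSketch(0b1010, 2), "");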
23032
23033// Check if pre-AVX condcode can be performed by a single FCMP op.
23034static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23035 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23036}
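// Editor's illustrative sketch (not part of this file): why SETUEQ/SETONE are
// the expensive cases - with only the eight SSE predicates, 'unordered or
// equal' needs two compares plus a logic op, which is what LowerVSETCC emits
// below. Hypothetical scalar model:
[[maybe_unused]] static inline bool fcmpUEQSketch(float A, float B) {
  bool Unord = (A != A) || (B != B); // cmpunordss (predicate 3): either is NaN
  bool Eq = (A == B);                // cmpeqss    (predicate 0)
  return Unord || Eq;                // orps ties them together
}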
23037
23038/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23039/// CMPs.
23040static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23041 SDValue &Op1, bool &IsAlwaysSignaling) {
23042 unsigned SSECC;
23043 bool Swap = false;
23044
23045 // SSE Condition code mapping:
23046 // 0 - EQ
23047 // 1 - LT
23048 // 2 - LE
23049 // 3 - UNORD
23050 // 4 - NEQ
23051 // 5 - NLT
23052 // 6 - NLE
23053 // 7 - ORD
23054 switch (SetCCOpcode) {
23055 // clang-format off
23056 default: llvm_unreachable("Unexpected SETCC condition");
23057 case ISD::SETOEQ:
23058 case ISD::SETEQ: SSECC = 0; break;
23059 case ISD::SETOGT:
23060 case ISD::SETGT: Swap = true; [[fallthrough]];
23061 case ISD::SETLT:
23062 case ISD::SETOLT: SSECC = 1; break;
23063 case ISD::SETOGE:
23064 case ISD::SETGE: Swap = true; [[fallthrough]];
23065 case ISD::SETLE:
23066 case ISD::SETOLE: SSECC = 2; break;
23067 case ISD::SETUO: SSECC = 3; break;
23068 case ISD::SETUNE:
23069 case ISD::SETNE: SSECC = 4; break;
23070 case ISD::SETULE: Swap = true; [[fallthrough]];
23071 case ISD::SETUGE: SSECC = 5; break;
23072 case ISD::SETULT: Swap = true; [[fallthrough]];
23073 case ISD::SETUGT: SSECC = 6; break;
23074 case ISD::SETO: SSECC = 7; break;
23075 case ISD::SETUEQ: SSECC = 8; break;
23076 case ISD::SETONE: SSECC = 12; break;
23077 // clang-format on
23078 }
23079 if (Swap)
23080 std::swap(Op0, Op1);
23081
23082 switch (SetCCOpcode) {
23083 default:
23084 IsAlwaysSignaling = true;
23085 break;
23086 case ISD::SETEQ:
23087 case ISD::SETOEQ:
23088 case ISD::SETUEQ:
23089 case ISD::SETNE:
23090 case ISD::SETONE:
23091 case ISD::SETUNE:
23092 case ISD::SETO:
23093 case ISD::SETUO:
23094 IsAlwaysSignaling = false;
23095 break;
23096 }
23097
23098 return SSECC;
23099}
23100
23101/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23102/// concatenate the result back.
23103 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23104 ISD::CondCode Cond, SelectionDAG &DAG,
23105 const SDLoc &dl) {
23106 assert(VT.isInteger() && VT == LHS.getValueType() &&
23107 VT == RHS.getValueType() && "Unsupported VTs!");
23108
23109 SDValue CC = DAG.getCondCode(Cond);
23110
23111 // Extract the LHS Lo/Hi vectors
23112 SDValue LHS1, LHS2;
23113 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23114
23115 // Extract the RHS Lo/Hi vectors
23116 SDValue RHS1, RHS2;
23117 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23118
23119 // Issue the operation on the smaller types and concatenate the result back
23120 EVT LoVT, HiVT;
23121 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23122 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23123 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23124 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23125}
23126
23127 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23128 SelectionDAG &DAG) {
23129 SDValue Op0 = Op.getOperand(0);
23130 SDValue Op1 = Op.getOperand(1);
23131 SDValue CC = Op.getOperand(2);
23132 MVT VT = Op.getSimpleValueType();
23133 assert(VT.getVectorElementType() == MVT::i1 &&
23134 "Cannot set masked compare for this operation");
23135
23136 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23137
23138 // Prefer SETGT over SETLT.
23139 if (SetCCOpcode == ISD::SETLT) {
23140 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23141 std::swap(Op0, Op1);
23142 }
23143
23144 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23145}
23146
23147/// Given a buildvector constant, return a new vector constant with each element
23148/// incremented or decremented. If incrementing or decrementing would result in
23149/// unsigned overflow or underflow or this is not a simple vector constant,
23150/// return an empty value.
23151 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23152 bool NSW) {
23153 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23154 if (!BV || !V.getValueType().isSimple())
23155 return SDValue();
23156
23157 MVT VT = V.getSimpleValueType();
23158 MVT EltVT = VT.getVectorElementType();
23159 unsigned NumElts = VT.getVectorNumElements();
23160 SmallVector<SDValue, 8> NewVecC;
23161 SDLoc DL(V);
23162 for (unsigned i = 0; i < NumElts; ++i) {
23163 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23164 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23165 return SDValue();
23166
23167 // Avoid overflow/underflow.
23168 const APInt &EltC = Elt->getAPIntValue();
23169 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23170 return SDValue();
23171 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23172 (!IsInc && EltC.isMinSignedValue())))
23173 return SDValue();
23174
23175 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23176 }
23177
23178 return DAG.getBuildVector(VT, DL, NewVecC);
23179}
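// Editor's illustrative sketch (not part of this file): how the +/-1 constants
// built above are used by the unsigned min/max lowering further down - an
// unsigned 'X < C' becomes 'X <= C-1', which is just 'umin(X, C-1) == X' and
// needs no inversion. Hypothetical u8 model; C != 0 mirrors the underflow check.
[[maybe_unused]] static constexpr bool ultViaUMinSketch(uint8_t X, uint8_t C) {
  uint8_t Dec = (uint8_t)(C - 1);
  uint8_t Min = X < Dec ? X : Dec; // pminub
  return Min == X;                 // pcmpeqb
}
static_assert(ultViaUMinSketch(3, 9) && !ultViaUMinSketch(9, 9), "");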
23180
23181/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23182/// Op0 u<= Op1:
23183/// t = psubus Op0, Op1
23184/// pcmpeq t, <0..0>
23185 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23186 ISD::CondCode Cond, const SDLoc &dl,
23187 const X86Subtarget &Subtarget,
23188 SelectionDAG &DAG) {
23189 if (!Subtarget.hasSSE2())
23190 return SDValue();
23191
23192 MVT VET = VT.getVectorElementType();
23193 if (VET != MVT::i8 && VET != MVT::i16)
23194 return SDValue();
23195
23196 switch (Cond) {
23197 default:
23198 return SDValue();
23199 case ISD::SETULT: {
23200 // If the comparison is against a constant we can turn this into a
23201 // setule. With psubus, setule does not require a swap. This is
23202 // beneficial because the constant in the register is no longer
23203 // clobbered as the destination, so it can be hoisted out of a loop.
23204 // Only do this pre-AVX since vpcmp* is no longer destructive.
23205 if (Subtarget.hasAVX())
23206 return SDValue();
23207 SDValue ULEOp1 =
23208 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23209 if (!ULEOp1)
23210 return SDValue();
23211 Op1 = ULEOp1;
23212 break;
23213 }
23214 case ISD::SETUGT: {
23215 // If the comparison is against a constant, we can turn this into a setuge.
23216 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23217 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23218 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23219 SDValue UGEOp1 =
23220 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23221 if (!UGEOp1)
23222 return SDValue();
23223 Op1 = Op0;
23224 Op0 = UGEOp1;
23225 break;
23226 }
23227 // Psubus is better than flip-sign because it requires no inversion.
23228 case ISD::SETUGE:
23229 std::swap(Op0, Op1);
23230 break;
23231 case ISD::SETULE:
23232 break;
23233 }
23234
23235 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23236 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23237 DAG.getConstant(0, dl, VT));
23238}
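// Editor's illustrative sketch (not part of this file): the scalar form of the
// PSUBUS trick above - unsigned 'A <= B' holds exactly when the saturating
// subtraction of B from A is zero. Hypothetical helper on a single u8 lane.
[[maybe_unused]] static constexpr bool uleViaUSubSatSketch(uint8_t A, uint8_t B) {
  uint8_t Sat = A > B ? (uint8_t)(A - B) : (uint8_t)0; // psubusb
  return Sat == 0;                                     // pcmpeqb against zero
}
static_assert(uleViaUSubSatSketch(3, 7) && uleViaUSubSatSketch(7, 7) &&
              !uleViaUSubSatSketch(8, 7), "");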
23239
23240static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23241 SelectionDAG &DAG) {
23242 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23243 Op.getOpcode() == ISD::STRICT_FSETCCS;
23244 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23245 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23246 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23247 MVT VT = Op->getSimpleValueType(0);
23248 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23249 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23250 SDLoc dl(Op);
23251
23252 if (isFP) {
23253 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23254 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23255 if (isSoftF16(EltVT, Subtarget))
23256 return SDValue();
23257
23258 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23259 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23260
23261 // If we have a strict compare with a vXi1 result and the input is 128/256
23262 // bits we can't use a masked compare unless we have VLX. If we use a wider
23263 // compare like we do for non-strict, we might trigger spurious exceptions
23264 // from the upper elements. Instead emit an AVX compare and convert to mask.
23265 unsigned Opc;
23266 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23267 (!IsStrict || Subtarget.hasVLX() ||
23268 Op0.getSimpleValueType().is512BitVector())) {
23269#ifndef NDEBUG
23270 unsigned Num = VT.getVectorNumElements();
23271 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23272#endif
23273 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23274 } else {
23275 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23276 // The SSE/AVX packed FP comparison nodes are defined with a
23277 // floating-point vector result that matches the operand type. This allows
23278 // them to work with an SSE1 target (integer vector types are not legal).
23279 VT = Op0.getSimpleValueType();
23280 }
23281
23282 SDValue Cmp;
23283 bool IsAlwaysSignaling;
23284 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23285 if (!Subtarget.hasAVX()) {
23286 // TODO: We could use following steps to handle a quiet compare with
23287 // signaling encodings.
23288 // 1. Get ordered masks from a quiet ISD::SETO
23289 // 2. Use the masks to mask potential unordered elements in operand A, B
23290 // 3. Get the compare results of masked A, B
23291 // 4. Calculate the final result using the mask and the result from step 3
23292 // But currently, we just fall back to scalar operations.
23293 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23294 return SDValue();
23295
23296 // Insert an extra signaling instruction to raise exception.
23297 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23298 SDValue SignalCmp = DAG.getNode(
23299 Opc, dl, {VT, MVT::Other},
23300 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23301 // FIXME: It seems we need to update the flags of all new strict nodes.
23302 // Otherwise, mayRaiseFPException in MI will return false due to
23303 // NoFPExcept = false by default. However, I didn't find it in other
23304 // patches.
23305 SignalCmp->setFlags(Op->getFlags());
23306 Chain = SignalCmp.getValue(1);
23307 }
23308
23309 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23310 // emit two comparisons and a logic op to tie them together.
23311 if (!cheapX86FSETCC_SSE(Cond)) {
23312 // LLVM predicate is SETUEQ or SETONE.
23313 unsigned CC0, CC1;
23314 unsigned CombineOpc;
23315 if (Cond == ISD::SETUEQ) {
23316 CC0 = 3; // UNORD
23317 CC1 = 0; // EQ
23318 CombineOpc = X86ISD::FOR;
23319 } else {
23320 assert(Cond == ISD::SETONE);
23321 CC0 = 7; // ORD
23322 CC1 = 4; // NEQ
23323 CombineOpc = X86ISD::FAND;
23324 }
23325
23326 SDValue Cmp0, Cmp1;
23327 if (IsStrict) {
23328 Cmp0 = DAG.getNode(
23329 Opc, dl, {VT, MVT::Other},
23330 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23331 Cmp1 = DAG.getNode(
23332 Opc, dl, {VT, MVT::Other},
23333 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23334 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23335 Cmp1.getValue(1));
23336 } else {
23337 Cmp0 = DAG.getNode(
23338 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23339 Cmp1 = DAG.getNode(
23340 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23341 }
23342 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23343 } else {
23344 if (IsStrict) {
23345 Cmp = DAG.getNode(
23346 Opc, dl, {VT, MVT::Other},
23347 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23348 Chain = Cmp.getValue(1);
23349 } else
23350 Cmp = DAG.getNode(
23351 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23352 }
23353 } else {
23354 // Handle all other FP comparisons here.
23355 if (IsStrict) {
23356 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23357 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23358 Cmp = DAG.getNode(
23359 Opc, dl, {VT, MVT::Other},
23360 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23361 Chain = Cmp.getValue(1);
23362 } else
23363 Cmp = DAG.getNode(
23364 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23365 }
23366
23367 if (VT.getFixedSizeInBits() >
23368 Op.getSimpleValueType().getFixedSizeInBits()) {
23369 // We emitted a compare with an XMM/YMM result. Finish converting to a
23370 // mask register using a vptestm.
23371 MVT CastVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23372 Cmp = DAG.getBitcast(CastVT, Cmp);
23373 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23374 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23375 } else {
23376 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23377 // the result type of SETCC. The bitcast is expected to be optimized
23378 // away during combining/isel.
23379 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23380 }
23381
23382 if (IsStrict)
23383 return DAG.getMergeValues({Cmp, Chain}, dl);
23384
23385 return Cmp;
23386 }
23387
23388 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23389
23390 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
23391 assert(VTOp0 == Op1.getSimpleValueType() &&
23392 "Expected operands with same type!");
23394 "Invalid number of packed elements for source and destination!");
23395
23396 // The non-AVX512 code below works under the assumption that source and
23397 // destination types are the same.
23398 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23399 "Value types for source and destination must be the same!");
23400
23401 // The result is boolean, but operands are int/float
23402 if (VT.getVectorElementType() == MVT::i1) {
23403 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23404 // but there is no compare instruction for i8 and i16 elements on KNL.
23405 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23406 "Unexpected operand type");
23407 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23408 }
23409
23410 // Lower using XOP integer comparisons.
23411 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23412 // Translate compare code to XOP PCOM compare mode.
23413 unsigned CmpMode = 0;
23414 switch (Cond) {
23415 // clang-format off
23416 default: llvm_unreachable("Unexpected SETCC condition");
23417 case ISD::SETULT:
23418 case ISD::SETLT: CmpMode = 0x00; break;
23419 case ISD::SETULE:
23420 case ISD::SETLE: CmpMode = 0x01; break;
23421 case ISD::SETUGT:
23422 case ISD::SETGT: CmpMode = 0x02; break;
23423 case ISD::SETUGE:
23424 case ISD::SETGE: CmpMode = 0x03; break;
23425 case ISD::SETEQ: CmpMode = 0x04; break;
23426 case ISD::SETNE: CmpMode = 0x05; break;
23427 // clang-format on
23428 }
23429
23430 // Are we comparing unsigned or signed integers?
23431 unsigned Opc =
23432 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23433
23434 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23435 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23436 }
23437
23438 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23439 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23440 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23441 SDValue BC0 = peekThroughBitcasts(Op0);
23442 if (BC0.getOpcode() == ISD::AND) {
23443 APInt UndefElts;
23444 SmallVector<APInt, 64> EltBits;
23445 if (getTargetConstantBitsFromNode(
23446 BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23447 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23448 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23449 Cond = ISD::SETEQ;
23450 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23451 }
23452 }
23453 }
23454 }
23455
23456 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23457 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23458 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23459 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23460 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23461 unsigned BitWidth = VT.getScalarSizeInBits();
23462 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23463
23464 SDValue Result = Op0.getOperand(0);
23465 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23466 DAG.getConstant(ShiftAmt, dl, VT));
23467 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23468 DAG.getConstant(BitWidth - 1, dl, VT));
23469 return Result;
23470 }
23471 }
23472
23473 // Break 256-bit integer vector compare into smaller ones.
23474 if (VT.is256BitVector() && !Subtarget.hasInt256())
23475 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23476
23477 // Break 512-bit integer vector compare into smaller ones.
23478 // TODO: Try harder to use VPCMPx + VPMOV2x?
23479 if (VT.is512BitVector())
23480 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23481
23482 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23483 // not-of-PCMPEQ:
23484 // X != INT_MIN --> X >s INT_MIN
23485 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23486 // +X != 0 --> +X >s 0
23487 APInt ConstValue;
23488 if (Cond == ISD::SETNE &&
23489 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23490 if (ConstValue.isMinSignedValue())
23491 Cond = ISD::SETGT;
23492 else if (ConstValue.isMaxSignedValue())
23493 Cond = ISD::SETLT;
23494 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23495 Cond = ISD::SETGT;
23496 }
23497
23498 // If both operands are known non-negative, then an unsigned compare is the
23499 // same as a signed compare and there's no need to flip signbits.
23500 // TODO: We could check for more general simplifications here since we're
23501 // computing known bits.
23502 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23503 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23504
23505 // Special case: Use min/max operations for unsigned compares.
23506 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23507 if (ISD::isUnsignedIntSetCC(Cond) &&
23508 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23509 TLI.isOperationLegal(ISD::UMIN, VT)) {
23510 // If we have a constant operand, increment/decrement it and change the
23511 // condition to avoid an invert.
23512 if (Cond == ISD::SETUGT) {
23513 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23514 if (SDValue UGTOp1 =
23515 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23516 Op1 = UGTOp1;
23517 Cond = ISD::SETUGE;
23518 }
23519 }
23520 if (Cond == ISD::SETULT) {
23521 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23522 if (SDValue ULTOp1 =
23523 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23524 Op1 = ULTOp1;
23525 Cond = ISD::SETULE;
23526 }
23527 }
23528 bool Invert = false;
23529 unsigned Opc;
23530 switch (Cond) {
23531 // clang-format off
23532 default: llvm_unreachable("Unexpected condition code");
23533 case ISD::SETUGT: Invert = true; [[fallthrough]];
23534 case ISD::SETULE: Opc = ISD::UMIN; break;
23535 case ISD::SETULT: Invert = true; [[fallthrough]];
23536 case ISD::SETUGE: Opc = ISD::UMAX; break;
23537 // clang-format on
23538 }
23539
23540 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23541 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23542
23543 // If the logical-not of the result is required, perform that now.
23544 if (Invert)
23545 Result = DAG.getNOT(dl, Result, VT);
23546
23547 return Result;
23548 }
23549
23550 // Try to use SUBUS and PCMPEQ.
23551 if (FlipSigns)
23552 if (SDValue V =
23553 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23554 return V;
23555
23556 // We are handling one of the integer comparisons here. Since SSE only has
23557 // GT and EQ comparisons for integer, swapping operands and multiple
23558 // operations may be required for some comparisons.
23559 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23560 : X86ISD::PCMPGT;
23561 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23562 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23563 bool Invert = Cond == ISD::SETNE ||
23564 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23565
23566 if (Swap)
23567 std::swap(Op0, Op1);
23568
23569 // Check that the operation in question is available (most are plain SSE2,
23570 // but PCMPGTQ and PCMPEQQ have different requirements).
23571 if (VT == MVT::v2i64) {
23572 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23573 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23574
23575 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23576 // the odd elements over the even elements.
23577 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23578 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23579 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23580
23581 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23582 static const int MaskHi[] = { 1, 1, 3, 3 };
23583 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23584
23585 return DAG.getBitcast(VT, Result);
23586 }
23587
23588 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23589 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23590 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23591
23592 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23593 static const int MaskHi[] = { 1, 1, 3, 3 };
23594 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23595
23596 return DAG.getBitcast(VT, Result);
23597 }
23598
23599 // If the i64 elements are sign-extended enough to be representable as i32
23600 // then we can compare the lower i32 bits and splat.
23601 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23602 DAG.ComputeNumSignBits(Op1) > 32) {
23603 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23604 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23605
23606 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23607 static const int MaskLo[] = {0, 0, 2, 2};
23608 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23609
23610 return DAG.getBitcast(VT, Result);
23611 }
23612
23613 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23614 // bits of the inputs before performing those operations. The lower
23615 // compare is always unsigned.
23616 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23617 : 0x0000000080000000ULL,
23618 dl, MVT::v2i64);
23619
23620 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23621 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23622
23623 // Cast everything to the right type.
23624 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23625 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23626
23627 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23628 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23629 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23630
23631 // Create masks for only the low parts/high parts of the 64 bit integers.
23632 static const int MaskHi[] = { 1, 1, 3, 3 };
23633 static const int MaskLo[] = { 0, 0, 2, 2 };
23634 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23635 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23636 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23637
23638 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23639 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23640
23641 if (Invert)
23642 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23643
23644 return DAG.getBitcast(VT, Result);
23645 }
23646
23647 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23648 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23649 // pcmpeqd + pshufd + pand.
23650 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23651
23652 // First cast everything to the right type.
23653 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23654 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23655
23656 // Do the compare.
23657 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23658
23659 // Make sure the lower and upper halves are both all-ones.
23660 static const int Mask[] = { 1, 0, 3, 2 };
23661 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23662 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23663
23664 if (Invert)
23665 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23666
23667 return DAG.getBitcast(VT, Result);
23668 }
23669 }
23670
23671 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23672 // bits of the inputs before performing those operations.
23673 if (FlipSigns) {
23674 MVT EltVT = VT.getVectorElementType();
23675 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23676 VT);
23677 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23678 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23679 }
23680
23681 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23682
23683 // If the logical-not of the result is required, perform that now.
23684 if (Invert)
23685 Result = DAG.getNOT(dl, Result, VT);
23686
23687 return Result;
23688}
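// Editor's illustrative sketch (not part of this file): the scalar identity
// behind the pre-SSE4.2 PCMPGTQ emulation above. A signed 64-bit compare is
// rebuilt from 32-bit halves; the high halves compare signed, the low halves
// compare unsigned (the DAG gets the unsigned low compare by XORing the sign
// bit before a signed PCMPGT). Hypothetical helper.
[[maybe_unused]] static constexpr bool sgt64ViaHalvesSketch(int64_t A, int64_t B) {
  int64_t AHi = A >> 32, BHi = B >> 32;          // signed high halves
  uint32_t ALo = (uint32_t)A, BLo = (uint32_t)B; // unsigned low halves
  return (AHi > BHi) || (AHi == BHi && ALo > BLo);
}
static_assert(sgt64ViaHalvesSketch(1, -1) && !sgt64ViaHalvesSketch(-2, 5), "");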
23689
23690// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23691 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23692 const SDLoc &dl, SelectionDAG &DAG,
23693 const X86Subtarget &Subtarget,
23694 SDValue &X86CC) {
23695 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23696
23697 // Must be a bitcast from vXi1.
23698 if (Op0.getOpcode() != ISD::BITCAST)
23699 return SDValue();
23700
23701 Op0 = Op0.getOperand(0);
23702 MVT VT = Op0.getSimpleValueType();
23703 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23704 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23705 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23706 return SDValue();
23707
23708 X86::CondCode X86Cond;
23709 if (isNullConstant(Op1)) {
23710 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23711 } else if (isAllOnesConstant(Op1)) {
23712 // C flag is set for all ones.
23713 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23714 } else
23715 return SDValue();
23716
23717 // If the input is an AND, we can combine its operands into the KTEST.
23718 bool KTestable = false;
23719 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23720 KTestable = true;
23721 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23722 KTestable = true;
23723 if (!isNullConstant(Op1))
23724 KTestable = false;
23725 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23726 SDValue LHS = Op0.getOperand(0);
23727 SDValue RHS = Op0.getOperand(1);
23728 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23729 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23730 }
23731
23732 // If the input is an OR, we can combine its operands into the KORTEST.
23733 SDValue LHS = Op0;
23734 SDValue RHS = Op0;
23735 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23736 LHS = Op0.getOperand(0);
23737 RHS = Op0.getOperand(1);
23738 }
23739
23740 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23741 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23742}
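// Editor's illustrative sketch (not part of this file): the flag behaviour the
// lowering above relies on. KORTEST ORs two masks and sets ZF when the result
// is all zeros and CF when it is all ones, so both 'mask == 0' and 'mask == -1'
// are answered without moving the mask to a GPR. Hypothetical 16-bit model.
[[maybe_unused]] static constexpr bool kortestSetsZFSketch(uint16_t K1, uint16_t K2) {
  return (uint16_t)(K1 | K2) == 0;      // ZF -> COND_E / COND_NE
}
[[maybe_unused]] static constexpr bool kortestSetsCFSketch(uint16_t K1, uint16_t K2) {
  return (uint16_t)(K1 | K2) == 0xFFFF; // CF -> COND_B / COND_AE
}
static_assert(kortestSetsZFSketch(0, 0) && kortestSetsCFSketch(0x00FF, 0xFF00), "");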
23743
23744/// Emit flags for the given setcc condition and operands. Also returns the
23745/// corresponding X86 condition code constant in X86CC.
23746SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23747 ISD::CondCode CC, const SDLoc &dl,
23748 SelectionDAG &DAG,
23749 SDValue &X86CC) const {
23750 // Equality Combines.
23751 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23752 X86::CondCode X86CondCode;
23753
23754 // Optimize to BT if possible.
23755 // Lower (X & (1 << N)) == 0 to BT(X, N).
23756 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23757 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23758 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23759 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23760 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23761 return BT;
23762 }
23763 }
23764
23765 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23766 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23767 X86CondCode)) {
23768 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23769 return CmpZ;
23770 }
23771
23772 // Try to lower using KORTEST or KTEST.
23773 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23774 return Test;
23775
23776 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23777 // of these.
23778 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23779 // If the input is a setcc, then reuse the input setcc or use a new one
23780 // with the inverted condition.
23781 if (Op0.getOpcode() == X86ISD::SETCC) {
23782 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23783
23784 X86CC = Op0.getOperand(0);
23785 if (Invert) {
23786 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23787 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23788 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23789 }
23790
23791 return Op0.getOperand(1);
23792 }
23793 }
23794
23795 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
23796 // overflow.
23797 if (isMinSignedConstant(Op1)) {
23798 EVT VT = Op0.getValueType();
23799 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
23800 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
23801 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
23802 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23803 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
23804 DAG.getConstant(0, dl, VT), Op0);
23805 return SDValue(Neg.getNode(), 1);
23806 }
23807 }
23808
23809 // Try to use the carry flag from the add in place of a separate CMP for:
23810 // (seteq (add X, -1), -1). Similar for setne.
23811 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23812 Op0.getOperand(1) == Op1) {
23813 if (isProfitableToUseFlagOp(Op0)) {
23814 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23815
23816 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23817 Op0.getOperand(1));
23818 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23819 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23820 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23821 return SDValue(New.getNode(), 1);
23822 }
23823 }
23824 }
23825
23826 X86::CondCode CondCode =
23827 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23828 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23829
23830 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23831 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23832 return EFLAGS;
23833}
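// Editor's illustrative sketch (not part of this file): the INT_MIN equality
// check above. Negation overflows exactly when the input is INT_MIN, so a NEG
// followed by a test of OF replaces a CMP against an awkward immediate.
// Hypothetical 32-bit model using wider arithmetic.
[[maybe_unused]] static constexpr bool negOverflows32Sketch(int32_t X) {
  int64_t Neg = -(int64_t)X;                 // what 'neg X' tries to produce
  return Neg > INT32_MAX || Neg < INT32_MIN; // OF after 'neg X'
}
static_assert(negOverflows32Sketch(INT32_MIN) && !negOverflows32Sketch(INT32_MIN + 1), "");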
23834
23835SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23836
23837 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23838 Op.getOpcode() == ISD::STRICT_FSETCCS;
23839 MVT VT = Op->getSimpleValueType(0);
23840
23841 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23842
23843 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23844 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23845 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23846 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23847 SDLoc dl(Op);
23848 ISD::CondCode CC =
23849 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23850
23851 if (isSoftF16(Op0.getValueType(), Subtarget))
23852 return SDValue();
23853
23854 // Handle f128 first, since one possible outcome is a normal integer
23855 // comparison which gets handled by emitFlagsForSetcc.
23856 if (Op0.getValueType() == MVT::f128) {
23857 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23858 Op.getOpcode() == ISD::STRICT_FSETCCS);
23859
23860 // If softenSetCCOperands returned a scalar, use it.
23861 if (!Op1.getNode()) {
23862 assert(Op0.getValueType() == Op.getValueType() &&
23863 "Unexpected setcc expansion!");
23864 if (IsStrict)
23865 return DAG.getMergeValues({Op0, Chain}, dl);
23866 return Op0;
23867 }
23868 }
23869
23870 if (Op0.getSimpleValueType().isInteger()) {
23871 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23872 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
23873 // this may translate to fewer uops depending on the uarch implementation. The
23874 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23875 // canonicalize to that CondCode.
23876 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23877 // encoding size - so it must either already be a i8 or i32 immediate, or it
23878 // shrinks down to that. We don't do this for any i64's to avoid additional
23879 // constant materializations.
23880 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23881 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23882 const APInt &Op1Val = Op1C->getAPIntValue();
23883 if (!Op1Val.isZero()) {
23884 // Ensure the constant+1 doesn't overflow.
23885 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23886 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23887 APInt Op1ValPlusOne = Op1Val + 1;
23888 if (Op1ValPlusOne.isSignedIntN(32) &&
23889 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23890 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23891 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23892 : ISD::CondCode::SETUGE;
23893 }
23894 }
23895 }
23896 }
23897
23898 SDValue X86CC;
23899 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23900 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23901 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23902 }
23903
23904 // Handle floating point.
23905 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23906 if (CondCode == X86::COND_INVALID)
23907 return SDValue();
23908
23909 SDValue EFLAGS;
23910 if (IsStrict) {
23911 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23912 EFLAGS =
23913 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23914 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23915 Chain = EFLAGS.getValue(1);
23916 } else {
23917 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23918 }
23919
23920 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23921 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23922 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23923}
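// Editor's illustrative sketch (not part of this file): the constant adjustment
// performed above for integer setcc - a strict 'X > C' becomes the non-strict
// 'X >= C + 1', whose GE/AE condition reads fewer EFLAGS bits. Only valid when
// C + 1 does not overflow, which the code checks first. Hypothetical helper.
[[maybe_unused]] static constexpr bool sgtAsSgeSketch(int32_t X, int32_t C) {
  return X >= C + 1; // caller guarantees C != INT32_MAX
}
static_assert(sgtAsSgeSketch(5, 4) && !sgtAsSgeSketch(4, 4), "");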
23924
23925SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23926 SDValue LHS = Op.getOperand(0);
23927 SDValue RHS = Op.getOperand(1);
23928 SDValue Carry = Op.getOperand(2);
23929 SDValue Cond = Op.getOperand(3);
23930 SDLoc DL(Op);
23931
23932 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23933 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23934
23935 // Recreate the carry if needed.
23936 EVT CarryVT = Carry.getValueType();
23937 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23938 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23939
23940 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23941 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23942 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23943}
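// Editor's illustrative sketch (not part of this file): why adding all-ones
// re-creates the carry above - a 0/1 carry value plus 0xFF..FF produces a
// carry-out exactly when the value was non-zero, putting it back into CF for
// the following SBB. Hypothetical 8-bit model.
[[maybe_unused]] static constexpr bool addAllOnesCarriesSketch(uint8_t Carry) {
  return (unsigned)Carry + 0xFFu > 0xFFu; // CF of 'add Carry, -1'
}
static_assert(addAllOnesCarriesSketch(1) && !addAllOnesCarriesSketch(0), "");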
23944
23945// This function returns three things: the arithmetic computation itself
23946// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23947// flag and the condition code define the case in which the arithmetic
23948// computation overflows.
23949static std::pair<SDValue, SDValue>
23950 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23951 assert(Op.getResNo() == 0 && "Unexpected result number!");
23952 SDValue Value, Overflow;
23953 SDValue LHS = Op.getOperand(0);
23954 SDValue RHS = Op.getOperand(1);
23955 unsigned BaseOp = 0;
23956 SDLoc DL(Op);
23957 switch (Op.getOpcode()) {
23958 default: llvm_unreachable("Unknown ovf instruction!");
23959 case ISD::SADDO:
23960 BaseOp = X86ISD::ADD;
23961 Cond = X86::COND_O;
23962 break;
23963 case ISD::UADDO:
23964 BaseOp = X86ISD::ADD;
23965 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23966 break;
23967 case ISD::SSUBO:
23968 BaseOp = X86ISD::SUB;
23969 Cond = X86::COND_O;
23970 break;
23971 case ISD::USUBO:
23972 BaseOp = X86ISD::SUB;
23973 Cond = X86::COND_B;
23974 break;
23975 case ISD::SMULO:
23976 BaseOp = X86ISD::SMUL;
23977 Cond = X86::COND_O;
23978 break;
23979 case ISD::UMULO:
23980 BaseOp = X86ISD::UMUL;
23981 Cond = X86::COND_O;
23982 break;
23983 }
23984
23985 if (BaseOp) {
23986 // Also sets EFLAGS.
23987 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23988 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23989 Overflow = Value.getValue(1);
23990 }
23991
23992 return std::make_pair(Value, Overflow);
23993}
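// Editor's illustrative sketch (not part of this file): the flag choices made
// above - unsigned add overflow is the carry flag (COND_B), signed add overflow
// is OF (COND_O). Hypothetical 32-bit models using wider arithmetic.
[[maybe_unused]] static constexpr bool uaddOverflowsSketch(uint32_t A, uint32_t B) {
  return (uint64_t)A + B > 0xFFFFFFFFull;    // CF after 'add'
}
[[maybe_unused]] static constexpr bool saddOverflowsSketch(int32_t A, int32_t B) {
  int64_t Sum = (int64_t)A + B;
  return Sum > INT32_MAX || Sum < INT32_MIN; // OF after 'add'
}
static_assert(uaddOverflowsSketch(0xFFFFFFFFu, 1u) && !saddOverflowsSketch(1, 2), "");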
23994
23995 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23996 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
23997 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23998 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23999 // has only one use.
24000 SDLoc DL(Op);
24001 X86::CondCode Cond;
24002 SDValue Value, Overflow;
24003 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24004
24005 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24006 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24007 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24008}
24009
24010/// Return true if opcode is a X86 logical comparison.
24011 static bool isX86LogicalCmp(SDValue Op) {
24012 unsigned Opc = Op.getOpcode();
24013 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24014 Opc == X86ISD::FCMP)
24015 return true;
24016 if (Op.getResNo() == 1 &&
24017 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24018 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24019 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24020 return true;
24021
24022 return false;
24023}
24024
24025 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24026 if (V.getOpcode() != ISD::TRUNCATE)
24027 return false;
24028
24029 SDValue VOp0 = V.getOperand(0);
24030 unsigned InBits = VOp0.getValueSizeInBits();
24031 unsigned Bits = V.getValueSizeInBits();
24032 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24033}
24034
24035SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24036 bool AddTest = true;
24037 SDValue Cond = Op.getOperand(0);
24038 SDValue Op1 = Op.getOperand(1);
24039 SDValue Op2 = Op.getOperand(2);
24040 SDLoc DL(Op);
24041 MVT VT = Op1.getSimpleValueType();
24042 SDValue CC;
24043
24044 if (isSoftF16(VT, Subtarget)) {
24045 MVT NVT = VT.changeTypeToInteger();
24046 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24047 DAG.getBitcast(NVT, Op1),
24048 DAG.getBitcast(NVT, Op2)));
24049 }
24050
24051 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24052 // are available or VBLENDV if AVX is available.
24053 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24054 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24055 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24056 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24057 bool IsAlwaysSignaling;
24058 unsigned SSECC =
24059 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24060 CondOp0, CondOp1, IsAlwaysSignaling);
24061
24062 if (Subtarget.hasAVX512()) {
24063 SDValue Cmp =
24064 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24065 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24066 assert(!VT.isVector() && "Not a scalar type?");
24067 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24068 }
24069
24070 if (SSECC < 8 || Subtarget.hasAVX()) {
24071 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24072 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24073
24074 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24075 // of 3 logic instructions for size savings and potentially speed.
24076 // Unfortunately, there is no scalar form of VBLENDV.
24077
24078 // If either operand is a +0.0 constant, don't try this. We can expect to
24079 // optimize away at least one of the logic instructions later in that
24080 // case, so that sequence would be faster than a variable blend.
24081
24082 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24083 // uses XMM0 as the selection register. That may need just as many
24084 // instructions as the AND/ANDN/OR sequence due to register moves, so
24085 // don't bother.
24086 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24087 !isNullFPConstant(Op2)) {
24088 // Convert to vectors, do a VSELECT, and convert back to scalar.
24089 // All of the conversions should be optimized away.
24090 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24091 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24092 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24093 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24094
24095 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24096 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24097
24098 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24099
24100 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24101 VSel, DAG.getIntPtrConstant(0, DL));
24102 }
24103 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24104 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24105 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24106 }
24107 }
24108
24109 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24110 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24111 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24112 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24113 }
24114
24115 if (Cond.getOpcode() == ISD::SETCC &&
24116 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24117 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24118 Cond = NewCond;
24119 // If the condition was updated, it's possible that the operands of the
24120 // select were also updated (for example, EmitTest has a RAUW). Refresh
24121 // the local references to the select operands in case they got stale.
24122 Op1 = Op.getOperand(1);
24123 Op2 = Op.getOperand(2);
24124 }
24125 }
24126
24127 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24128 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24129 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24130 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24131 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24132 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24133 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24134 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24135 if (Cond.getOpcode() == X86ISD::SETCC &&
24136 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24137 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24138 SDValue Cmp = Cond.getOperand(1);
24139 SDValue CmpOp0 = Cmp.getOperand(0);
24140 unsigned CondCode = Cond.getConstantOperandVal(0);
24141
24142 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24143 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24144 // handling to keep the CMP with 0. This should be removed by
24145 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24146 // cttz_zero_undef.
24147 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24148 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24149 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24150 };
24151 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24152 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24153 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24154 // Keep Cmp.
24155 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24156 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24157 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24158 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24159
24160 // 'X - 1' sets the carry flag if X == 0.
24161 // '0 - X' sets the carry flag if X != 0.
24162 // Convert the carry flag to a -1/0 mask with sbb:
24163 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24164 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24165 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24166 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24167 SDValue Sub;
24168 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24169 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24170 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24171 } else {
24172 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24173 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24174 }
24175 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24176 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24177 Sub.getValue(1));
24178 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24179 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24180 CmpOp0.getOpcode() == ISD::AND &&
24181 isOneConstant(CmpOp0.getOperand(1))) {
24182 SDValue Src1, Src2;
24183 // True if Op2 is an XOR or OR operator and one of its operands
24184 // is equal to Op1
24185 // ( a , a op b) || ( b , a op b)
24186 auto isOrXorPattern = [&]() {
24187 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24188 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24189 Src1 =
24190 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24191 Src2 = Op1;
24192 return true;
24193 }
24194 return false;
24195 };
24196
24197 if (isOrXorPattern()) {
24198 SDValue Neg;
24199 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24200 // We need a mask of all zeros or all ones with the same size as the other
24201 // operands.
24202 if (CmpSz > VT.getSizeInBits())
24203 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24204 else if (CmpSz < VT.getSizeInBits())
24205 Neg = DAG.getNode(ISD::AND, DL, VT,
24206 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24207 DAG.getConstant(1, DL, VT));
24208 else
24209 Neg = CmpOp0;
24210 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24211 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24212 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24213 }
24214 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24215 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24216 ((CondCode == X86::COND_S) || // smin(x, 0)
24217 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24218 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24219 //
24220 // If the comparison is testing for a positive value, we have to invert
24221 // the sign bit mask, so only do that transform if the target has a
24222 // bitwise 'and not' instruction (the invert is free).
24223 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24224 unsigned ShCt = VT.getSizeInBits() - 1;
24225 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24226 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24227 if (CondCode == X86::COND_G)
24228 Shift = DAG.getNOT(DL, Shift, VT);
24229 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24230 }
24231 }
24232
24233 // Look past (and (setcc_carry (cmp ...)), 1).
24234 if (Cond.getOpcode() == ISD::AND &&
24235 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24236 isOneConstant(Cond.getOperand(1)))
24237 Cond = Cond.getOperand(0);
24238
24239 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24240 // setting operand in place of the X86ISD::SETCC.
24241 unsigned CondOpcode = Cond.getOpcode();
24242 if (CondOpcode == X86ISD::SETCC ||
24243 CondOpcode == X86ISD::SETCC_CARRY) {
24244 CC = Cond.getOperand(0);
24245
24246 SDValue Cmp = Cond.getOperand(1);
24247 bool IllegalFPCMov = false;
24248 if (VT.isFloatingPoint() && !VT.isVector() &&
24249 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24250 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24251
24252 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24253 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24254 Cond = Cmp;
24255 AddTest = false;
24256 }
24257 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24258 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24259 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24260 SDValue Value;
24261 X86::CondCode X86Cond;
24262 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24263
24264 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24265 AddTest = false;
24266 }
24267
24268 if (AddTest) {
24269 // Look past the truncate if the high bits are known zero.
24270 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24271 Cond = Cond.getOperand(0);
24272
24273 // We know the result of AND is compared against zero. Try to match
24274 // it to BT.
24275 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24276 X86::CondCode X86CondCode;
24277 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24278 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24279 Cond = BT;
24280 AddTest = false;
24281 }
24282 }
24283 }
24284
24285 if (AddTest) {
24286 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24287 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24288 }
24289
24290 // a < b ? -1 : 0 -> RES = ~setcc_carry
24291 // a < b ? 0 : -1 -> RES = setcc_carry
24292 // a >= b ? -1 : 0 -> RES = setcc_carry
24293 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24294 if (Cond.getOpcode() == X86ISD::SUB) {
24295 unsigned CondCode = CC->getAsZExtVal();
24296
24297 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24298 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24299 (isNullConstant(Op1) || isNullConstant(Op2))) {
24300 SDValue Res =
24301 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24302 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24303 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24304 return DAG.getNOT(DL, Res, Res.getValueType());
24305 return Res;
24306 }
24307 }
24308
24309 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24310 // widen the cmov and push the truncate through. This avoids introducing a new
24311 // branch during isel and doesn't add any extensions.
24312 if (Op.getValueType() == MVT::i8 &&
24313 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24314 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24315 if (T1.getValueType() == T2.getValueType() &&
24316 // Exclude CopyFromReg to avoid partial register stalls.
24317 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24318 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24319 CC, Cond);
24320 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24321 }
24322 }
24323
24324 // Or finally, promote i8 cmovs if we have CMOV,
24325 // or i16 cmovs if it won't prevent folding a load.
24326 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24327 // legal, but EmitLoweredSelect() cannot deal with these extensions
24328 // being inserted between two CMOV's. (in i16 case too TBN)
24329 // https://bugs.llvm.org/show_bug.cgi?id=40974
24330 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24331 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24332 !X86::mayFoldLoad(Op2, Subtarget))) {
24333 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24334 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24335 SDValue Ops[] = { Op2, Op1, CC, Cond };
24336 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24337 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24338 }
24339
24340 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24341 // condition is true.
24342 SDValue Ops[] = { Op2, Op1, CC, Cond };
24343 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24344}
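// Editor's illustrative sketch (not part of this file): the SBB mask trick used
// in LowerSELECT above - 'X - 1' borrows exactly when X == 0, and 'sbb reg,reg'
// turns that borrow into an all-ones/all-zeros mask that is OR'ed with Y.
// Hypothetical 32-bit model of 'select (X == 0), -1, Y'.
[[maybe_unused]] static constexpr uint32_t selectAllOnesIfZeroSketch(uint32_t X, uint32_t Y) {
  uint32_t Borrow = X < 1u ? 1u : 0u; // CF of 'sub X, 1'
  uint32_t Mask = 0u - Borrow;        // sbb reg, reg -> 0 or 0xFFFFFFFF
  return Mask | Y;
}
static_assert(selectAllOnesIfZeroSketch(0, 5) == 0xFFFFFFFFu &&
              selectAllOnesIfZeroSketch(3, 5) == 5u, "");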
24345
24346 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
24347 const X86Subtarget &Subtarget,
24348 SelectionDAG &DAG) {
24349 MVT VT = Op->getSimpleValueType(0);
24350 SDValue In = Op->getOperand(0);
24351 MVT InVT = In.getSimpleValueType();
24352 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24353 MVT VTElt = VT.getVectorElementType();
24354 unsigned NumElts = VT.getVectorNumElements();
24355
24356 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24357 MVT ExtVT = VT;
24358 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24359 // If v16i32 is to be avoided, we'll need to split and concatenate.
24360 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24361 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24362
24363 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24364 }
24365
24366 // Widen to 512-bits if VLX is not supported.
24367 MVT WideVT = ExtVT;
24368 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24369 NumElts *= 512 / ExtVT.getSizeInBits();
24370 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24371 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24372 In, DAG.getIntPtrConstant(0, dl));
24373 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24374 }
24375
24376 SDValue V;
24377 MVT WideEltVT = WideVT.getVectorElementType();
24378 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24379 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24380 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24381 } else {
24382 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24383 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24384 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24385 }
24386
24387 // Truncate if we had to extend i16/i8 above.
24388 if (VT != ExtVT) {
24389 WideVT = MVT::getVectorVT(VTElt, NumElts);
24390 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24391 }
24392
24393 // Extract back to 128/256-bit if we widened.
24394 if (WideVT != VT)
24395 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24396 DAG.getIntPtrConstant(0, dl));
24397
24398 return V;
24399}
24400
24401 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24402 SelectionDAG &DAG) {
24403 SDValue In = Op->getOperand(0);
24404 MVT InVT = In.getSimpleValueType();
24405 SDLoc DL(Op);
24406
24407 if (InVT.getVectorElementType() == MVT::i1)
24408 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
24409
24410 assert(Subtarget.hasAVX() && "Expected AVX support");
24411 return LowerAVXExtend(Op, DL, DAG, Subtarget);
24412}
24413
24414// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24415// For sign extend this needs to handle all vector sizes and SSE4.1 and
24416// non-SSE4.1 targets. For zero extend this should only handle inputs of
24417// MVT::v64i8 when BWI is not supported, but AVX512 is.
24418 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24419 const X86Subtarget &Subtarget,
24420 SelectionDAG &DAG) {
24421 SDValue In = Op->getOperand(0);
24422 MVT VT = Op->getSimpleValueType(0);
24423 MVT InVT = In.getSimpleValueType();
24424
24425 MVT SVT = VT.getVectorElementType();
24426 MVT InSVT = InVT.getVectorElementType();
24427 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24428
24429 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24430 return SDValue();
24431 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24432 return SDValue();
24433 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24434 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24435 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24436 return SDValue();
24437
24438 SDLoc dl(Op);
24439 unsigned Opc = Op.getOpcode();
24440 unsigned NumElts = VT.getVectorNumElements();
24441
24442 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24443 // For 512-bit vectors, we need 128-bits or 256-bits.
24444 if (InVT.getSizeInBits() > 128) {
24445 // Input needs to be at least the same number of elements as output, and
24446 // at least 128-bits.
24447 int InSize = InSVT.getSizeInBits() * NumElts;
24448 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24449 InVT = In.getSimpleValueType();
24450 }
24451
24452 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24453 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24454 // need to be handled here for 256/512-bit results.
24455 if (Subtarget.hasInt256()) {
24456 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24457
24458 if (InVT.getVectorNumElements() != NumElts)
24459 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24460
24461 // FIXME: Apparently we create inreg operations that could be regular
24462 // extends.
24463 unsigned ExtOpc =
24464 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24465 : ISD::ZERO_EXTEND;
24466 return DAG.getNode(ExtOpc, dl, VT, In);
24467 }
24468
24469 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24470 if (Subtarget.hasAVX()) {
24471 assert(VT.is256BitVector() && "256-bit vector expected");
24472 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24473 int HalfNumElts = HalfVT.getVectorNumElements();
24474
24475 unsigned NumSrcElts = InVT.getVectorNumElements();
24476 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24477 for (int i = 0; i != HalfNumElts; ++i)
24478 HiMask[i] = HalfNumElts + i;
24479
24480 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24481 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24482 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24483 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24484 }
24485
24486 // We should only get here for sign extend.
24487 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24488 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24489 unsigned InNumElts = InVT.getVectorNumElements();
24490
24491 // If the source elements are already all-signbits, we don't need to extend,
24492 // just splat the elements.
24493 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24494 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24495 unsigned Scale = InNumElts / NumElts;
24496 SmallVector<int, 16> ShuffleMask;
24497 for (unsigned I = 0; I != NumElts; ++I)
24498 ShuffleMask.append(Scale, I);
24499 return DAG.getBitcast(VT,
24500 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24501 }
24502
24503 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24504 SDValue Curr = In;
24505 SDValue SignExt = Curr;
24506
24507 // As SRAI is only available on i16/i32 types, we expand only up to i32
24508 // and handle i64 separately.
24509 if (InVT != MVT::v4i32) {
24510 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24511
24512 unsigned DestWidth = DestVT.getScalarSizeInBits();
24513 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24514 unsigned DestElts = DestVT.getVectorNumElements();
24515
24516 // Build a shuffle mask that takes each input element and places it in the
24517 // MSBs of the new element size.
24518 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24519 for (unsigned i = 0; i != DestElts; ++i)
24520 Mask[i * Scale + (Scale - 1)] = i;
24521
24522 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24523 Curr = DAG.getBitcast(DestVT, Curr);
24524
24525 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24526 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24527 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24528 }
24529
24530 if (VT == MVT::v2i64) {
24531 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24532 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24533 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24534 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24535 SignExt = DAG.getBitcast(VT, SignExt);
24536 }
24537
24538 return SignExt;
24539}
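
// Worked example (illustrative) for the pre-SSE4.1 path above: for a v16i8
// source and a v4i32 SIGN_EXTEND_VECTOR_INREG result, the code builds
//   t0 = shuffle %in, %in, <-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3>
//   t1 = bitcast t0 to v4i32      ; each source byte now sits in the MSBs
//   r  = X86ISD::VSRAI t1, 24     ; the arithmetic shift replicates the sign
// For a v2i64 result the sign words are additionally computed as (0 > t) with
// SETGT on the v4i32 value and interleaved via the {0,4,1,5} shuffle before
// the final bitcast to v2i64.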
24540
24541 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24542 SelectionDAG &DAG) {
24543 MVT VT = Op->getSimpleValueType(0);
24544 SDValue In = Op->getOperand(0);
24545 MVT InVT = In.getSimpleValueType();
24546 SDLoc dl(Op);
24547
24548 if (InVT.getVectorElementType() == MVT::i1)
24549 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
24550
24551 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24553 "Expected same number of elements");
24554 assert((VT.getVectorElementType() == MVT::i16 ||
24555 VT.getVectorElementType() == MVT::i32 ||
24556 VT.getVectorElementType() == MVT::i64) &&
24557 "Unexpected element type");
24558 assert((InVT.getVectorElementType() == MVT::i8 ||
24559 InVT.getVectorElementType() == MVT::i16 ||
24560 InVT.getVectorElementType() == MVT::i32) &&
24561 "Unexpected element type");
24562
24563 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24564 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24565 return splitVectorIntUnary(Op, DAG, dl);
24566 }
24567
24568 if (Subtarget.hasInt256())
24569 return Op;
24570
24571 // Optimize vectors in AVX mode
24572 // Sign extend v8i16 to v8i32 and
24573 // v4i32 to v4i64
24574 //
24575 // Divide input vector into two parts
24576 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24577 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24578 // concat the vectors to original VT
24579 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24580 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24581
24582 unsigned NumElems = InVT.getVectorNumElements();
24583 SmallVector<int,8> ShufMask(NumElems, -1);
24584 for (unsigned i = 0; i != NumElems/2; ++i)
24585 ShufMask[i] = i + NumElems/2;
24586
24587 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24588 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24589
24590 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24591}
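
// Worked example (illustrative) for the pre-AVX2 split above: a v8i16 -> v8i32
// sign extend becomes
//   lo = SIGN_EXTEND_VECTOR_INREG %in             ; v4i32 from elements 0..3
//   hi = shuffle %in, %in, <4,5,6,7,-1,-1,-1,-1>  ; move elements 4..7 down
//   hi = SIGN_EXTEND_VECTOR_INREG hi              ; v4i32 from elements 4..7
//   r  = CONCAT_VECTORS lo, hi                    ; v8i32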
24592
24593/// Change a vector store into a pair of half-size vector stores.
24594 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24595 SDValue StoredVal = Store->getValue();
24596 assert((StoredVal.getValueType().is256BitVector() ||
24597 StoredVal.getValueType().is512BitVector()) &&
24598 "Expecting 256/512-bit op");
24599
24600 // Splitting volatile memory ops is not allowed unless the operation was not
24601 // legal to begin with. Assume the input store is legal (this transform is
24602 // only used for targets with AVX). Note: It is possible that we have an
24603 // illegal type like v2i128, and so we could allow splitting a volatile store
24604 // in that case if that is important.
24605 if (!Store->isSimple())
24606 return SDValue();
24607
24608 SDLoc DL(Store);
24609 SDValue Value0, Value1;
24610 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24611 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24612 SDValue Ptr0 = Store->getBasePtr();
24613 SDValue Ptr1 =
24614 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24615 SDValue Ch0 =
24616 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24617 Store->getOriginalAlign(),
24618 Store->getMemOperand()->getFlags());
24619 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24620 Store->getPointerInfo().getWithOffset(HalfOffset),
24621 Store->getOriginalAlign(),
24622 Store->getMemOperand()->getFlags());
24623 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24624}
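
// Worked example (illustrative): splitting "store <8 x i32> %v, ptr %p" with
// the routine above produces two independent 128-bit stores,
//   store <4 x i32> (lo half of %v), ptr %p
//   store <4 x i32> (hi half of %v), ptr (%p + 16)   ; HalfOffset = 16 bytes
// joined by a TokenFactor so neither store is ordered after the other.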
24625
24626/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24627/// type.
24628 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24629 SelectionDAG &DAG) {
24630 SDValue StoredVal = Store->getValue();
24631 assert(StoreVT.is128BitVector() &&
24632 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24633 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24634
24635 // Splitting volatile memory ops is not allowed unless the operation was not
24636 // legal to begin with. We are assuming the input op is legal (this transform
24637 // is only used for targets with AVX).
24638 if (!Store->isSimple())
24639 return SDValue();
24640
24641 MVT StoreSVT = StoreVT.getScalarType();
24642 unsigned NumElems = StoreVT.getVectorNumElements();
24643 unsigned ScalarSize = StoreSVT.getStoreSize();
24644
24645 SDLoc DL(Store);
24646 SmallVector<SDValue, 4> Stores;
24647 for (unsigned i = 0; i != NumElems; ++i) {
24648 unsigned Offset = i * ScalarSize;
24649 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24650 TypeSize::getFixed(Offset), DL);
24651 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24652 DAG.getIntPtrConstant(i, DL));
24653 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24654 Store->getPointerInfo().getWithOffset(Offset),
24655 Store->getOriginalAlign(),
24656 Store->getMemOperand()->getFlags());
24657 Stores.push_back(Ch);
24658 }
24659 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24660}
24661
24662static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24663 SelectionDAG &DAG) {
24664 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24665 SDLoc dl(St);
24666 SDValue StoredVal = St->getValue();
24667
24668 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24669 if (StoredVal.getValueType().isVector() &&
24670 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24671 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24672 assert(NumElts <= 8 && "Unexpected VT");
24673 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24674 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24675 "Expected AVX512F without AVX512DQI");
24676
24677 // We must pad with zeros to ensure we store zeroes to any unused bits.
24678 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24679 DAG.getUNDEF(MVT::v16i1), StoredVal,
24680 DAG.getIntPtrConstant(0, dl));
24681 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24682 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24683 // Make sure we store zeros in the extra bits.
24684 if (NumElts < 8)
24685 StoredVal = DAG.getZeroExtendInReg(
24686 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24687
24688 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24689 St->getPointerInfo(), St->getOriginalAlign(),
24690 St->getMemOperand()->getFlags());
24691 }
24692
24693 if (St->isTruncatingStore())
24694 return SDValue();
24695
24696 // If this is a 256-bit store of concatenated ops, we are better off splitting
24697 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24698 // and each half can execute independently. Some cores would split the op into
24699 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24700 MVT StoreVT = StoredVal.getSimpleValueType();
24701 if (StoreVT.is256BitVector() ||
24702 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24703 !Subtarget.hasBWI())) {
24704 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24705 return splitVectorStore(St, DAG);
24706 return SDValue();
24707 }
24708
24709 if (StoreVT.is32BitVector())
24710 return SDValue();
24711
24712 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24713 assert(StoreVT.is64BitVector() && "Unexpected VT");
24714 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24716 "Unexpected type action!");
24717
24718 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24719 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24720 DAG.getUNDEF(StoreVT));
24721
24722 if (Subtarget.hasSSE2()) {
24723 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24724 // and store it.
24725 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24726 MVT CastVT = MVT::getVectorVT(StVT, 2);
24727 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24728 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24729 DAG.getIntPtrConstant(0, dl));
24730
24731 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24732 St->getPointerInfo(), St->getOriginalAlign(),
24733 St->getMemOperand()->getFlags());
24734 }
24735 assert(Subtarget.hasSSE1() && "Expected SSE");
24736 SDVTList Tys = DAG.getVTList(MVT::Other);
24737 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24738 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24739 St->getMemOperand());
24740}
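
// Worked example (illustrative) for the v*i1 path above on AVX512F without
// AVX512DQ: storing a v4i1 value %m becomes
//   t0 = INSERT_SUBVECTOR undef:v16i1, %m, 0
//   t1 = bitcast t0 to i16
//   t2 = truncate t1 to i8
//   t3 = zero-extend-in-reg t2 from i4   ; clear the four unused high bits
//   store i8 t3
// For v8i1 the zero-extend-in-reg step is skipped because all 8 bits are used.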
24741
24742// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24743// may emit an illegal shuffle but the expansion is still better than scalar
24744// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24745 // we'll emit a shuffle and an arithmetic shift.
24746// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24747// TODO: It is possible to support ZExt by zeroing the undef values during
24748// the shuffle phase or after the shuffle.
24749static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24750 SelectionDAG &DAG) {
24751 MVT RegVT = Op.getSimpleValueType();
24752 assert(RegVT.isVector() && "We only custom lower vector loads.");
24753 assert(RegVT.isInteger() &&
24754 "We only custom lower integer vector loads.");
24755
24756 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24757 SDLoc dl(Ld);
24758
24759 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24760 if (RegVT.getVectorElementType() == MVT::i1) {
24761 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24762 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24763 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24764 "Expected AVX512F without AVX512DQI");
24765
24766 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24767 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24768 Ld->getMemOperand()->getFlags());
24769
24770 // Replace chain users with the new chain.
24771 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24772
24773 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24774 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24775 DAG.getBitcast(MVT::v16i1, Val),
24776 DAG.getIntPtrConstant(0, dl));
24777 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24778 }
24779
24780 return SDValue();
24781}
24782
24783/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24784/// each of which has no other use apart from the AND / OR.
24785static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24786 Opc = Op.getOpcode();
24787 if (Opc != ISD::OR && Opc != ISD::AND)
24788 return false;
24789 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24790 Op.getOperand(0).hasOneUse() &&
24791 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24792 Op.getOperand(1).hasOneUse());
24793}
24794
24795SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24796 SDValue Chain = Op.getOperand(0);
24797 SDValue Cond = Op.getOperand(1);
24798 SDValue Dest = Op.getOperand(2);
24799 SDLoc dl(Op);
24800
24801 // Bail out when we don't have native compare instructions.
24802 if (Cond.getOpcode() == ISD::SETCC &&
24803 Cond.getOperand(0).getValueType() != MVT::f128 &&
24804 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24805 SDValue LHS = Cond.getOperand(0);
24806 SDValue RHS = Cond.getOperand(1);
24807 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24808
24809 // Special case for
24810 // setcc([su]{add,sub,mul}o == 0)
24811 // setcc([su]{add,sub,mul}o != 1)
24812 if (ISD::isOverflowIntrOpRes(LHS) &&
24813 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24814 (isNullConstant(RHS) || isOneConstant(RHS))) {
24815 SDValue Value, Overflow;
24816 X86::CondCode X86Cond;
24817 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24818
24819 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24820 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24821
24822 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24823 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24824 Overflow);
24825 }
24826
24827 if (LHS.getSimpleValueType().isInteger()) {
24828 SDValue CCVal;
24829 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24830 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24831 EFLAGS);
24832 }
24833
24834 if (CC == ISD::SETOEQ) {
24835 // For FCMP_OEQ, we can emit
24836 // two branches instead of an explicit AND instruction with a
24837 // separate test. However, we only do this if this block doesn't
24838 // have a fall-through edge, because this requires an explicit
24839 // jmp when the condition is false.
24840 if (Op.getNode()->hasOneUse()) {
24841 SDNode *User = *Op.getNode()->use_begin();
24842 // Look for an unconditional branch following this conditional branch.
24843 // We need this because we need to reverse the successors in order
24844 // to implement FCMP_OEQ.
24845 if (User->getOpcode() == ISD::BR) {
24846 SDValue FalseBB = User->getOperand(1);
24847 SDNode *NewBR =
24848 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24849 assert(NewBR == User);
24850 (void)NewBR;
24851 Dest = FalseBB;
24852
24853 SDValue Cmp =
24854 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24855 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24856 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24857 CCVal, Cmp);
24858 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24859 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24860 Cmp);
24861 }
24862 }
24863 } else if (CC == ISD::SETUNE) {
24864 // For FCMP_UNE, we can emit
24865 // two branches instead of an explicit OR instruction with a
24866 // separate test.
24867 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24868 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24869 Chain =
24870 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24871 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24872 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24873 Cmp);
24874 } else {
24875 X86::CondCode X86Cond =
24876 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24877 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24878 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24879 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24880 Cmp);
24881 }
24882 }
24883
24884 if (ISD::isOverflowIntrOpRes(Cond)) {
24885 SDValue Value, Overflow;
24886 X86::CondCode X86Cond;
24887 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24888
24889 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24890 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24891 Overflow);
24892 }
24893
24894 // Look past the truncate if the high bits are known zero.
24895 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24896 Cond = Cond.getOperand(0);
24897
24898 EVT CondVT = Cond.getValueType();
24899
24900 // Add an AND with 1 if we don't already have one.
24901 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24902 Cond =
24903 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24904
24905 SDValue LHS = Cond;
24906 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24907
24908 SDValue CCVal;
24909 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24910 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24911 EFLAGS);
24912}
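
// Worked example (illustrative; %dest and %fallthru are placeholder block
// names) for the SETOEQ case above: when the brcond is followed by an
// unconditional branch, the successors are swapped so that
//   br (setoeq f32 %a, %b), %dest, %fallthru
// is emitted as a single X86ISD::FCMP feeding two branches,
//   jne %fallthru    ; COND_NE: operands differ
//   jp  %fallthru    ; COND_P:  unordered (NaN)
//   jmp %dest        ; both checks passed, so the compare was ordered-equal
// which avoids materializing an explicit AND of the two conditions.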
24913
24914// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24915// Calls to _alloca are needed to probe the stack when allocating more than 4k
24916// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24917// that the guard pages used by the OS virtual memory manager are allocated in
24918// correct sequence.
24919SDValue
24920X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24921 SelectionDAG &DAG) const {
24922 MachineFunction &MF = DAG.getMachineFunction();
24923 bool SplitStack = MF.shouldSplitStack();
24924 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24925 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24926 SplitStack || EmitStackProbeCall;
24927 SDLoc dl(Op);
24928
24929 // Get the inputs.
24930 SDNode *Node = Op.getNode();
24931 SDValue Chain = Op.getOperand(0);
24932 SDValue Size = Op.getOperand(1);
24933 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24934 EVT VT = Node->getValueType(0);
24935
24936 // Chain the dynamic stack allocation so that it doesn't modify the stack
24937 // pointer when other instructions are using the stack.
24938 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24939
24940 bool Is64Bit = Subtarget.is64Bit();
24941 MVT SPTy = getPointerTy(DAG.getDataLayout());
24942
24944 if (!Lower) {
24945 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24946 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24947 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24948 " not tell us which reg is the stack pointer!");
24949
24950 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24951 const Align StackAlign = TFI.getStackAlign();
24952 if (hasInlineStackProbe(MF)) {
24953 MachineRegisterInfo &MRI = MF.getRegInfo();
24954
24955 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24956 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24957 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24958 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24959 DAG.getRegister(Vreg, SPTy));
24960 } else {
24961 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24962 Chain = SP.getValue(1);
24963 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24964 }
24965 if (Alignment && *Alignment > StackAlign)
24966 Result =
24967 DAG.getNode(ISD::AND, dl, VT, Result,
24968 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24969 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24970 } else if (SplitStack) {
24971 MachineRegisterInfo &MRI = MF.getRegInfo();
24972
24973 if (Is64Bit) {
24974 // The 64-bit implementation of segmented stacks needs to clobber both r10
24975 // and r11. This makes it impossible to use it along with nested parameters.
24976 const Function &F = MF.getFunction();
24977 for (const auto &A : F.args()) {
24978 if (A.hasNestAttr())
24979 report_fatal_error("Cannot use segmented stacks with functions that "
24980 "have nested arguments.");
24981 }
24982 }
24983
24984 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24985 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24986 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24987 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24988 DAG.getRegister(Vreg, SPTy));
24989 } else {
24990 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24991 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24992 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24993
24994 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24995 Register SPReg = RegInfo->getStackRegister();
24996 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24997 Chain = SP.getValue(1);
24998
24999 if (Alignment) {
25000 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25001 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25002 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25003 }
25004
25005 Result = SP;
25006 }
25007
25008 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25009
25010 SDValue Ops[2] = {Result, Chain};
25011 return DAG.getMergeValues(Ops, dl);
25012}
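
// Worked example (illustrative) for the !Lower path above: an alloca of %size
// bytes with a requested alignment larger than the target's stack alignment
// (say 64 vs. 16 bytes) becomes
//   sp     = CopyFromReg SPReg
//   result = sp - %size
//   result = result & -64          ; only emitted because 64 > StackAlign
//   CopyToReg SPReg, result
// wrapped in CALLSEQ_START/CALLSEQ_END so nothing else adjusts the stack
// pointer while the allocation is in flight.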
25013
25014SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25015 MachineFunction &MF = DAG.getMachineFunction();
25016 auto PtrVT = getPointerTy(MF.getDataLayout());
25017 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25018
25019 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25020 SDLoc DL(Op);
25021
25022 if (!Subtarget.is64Bit() ||
25023 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25024 // vastart just stores the address of the VarArgsFrameIndex slot into the
25025 // memory location argument.
25026 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25027 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25028 MachinePointerInfo(SV));
25029 }
25030
25031 // __va_list_tag:
25032 // gp_offset (0 - 6 * 8)
25033 // fp_offset (48 - 48 + 8 * 16)
25034 // overflow_arg_area (points to parameters coming in memory).
25035 // reg_save_area
25036 SmallVector<SDValue, 8> MemOps;
25037 SDValue FIN = Op.getOperand(1);
25038 // Store gp_offset
25039 SDValue Store = DAG.getStore(
25040 Op.getOperand(0), DL,
25041 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25042 MachinePointerInfo(SV));
25043 MemOps.push_back(Store);
25044
25045 // Store fp_offset
25046 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25047 Store = DAG.getStore(
25048 Op.getOperand(0), DL,
25049 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25050 MachinePointerInfo(SV, 4));
25051 MemOps.push_back(Store);
25052
25053 // Store ptr to overflow_arg_area
25054 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25055 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25056 Store =
25057 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25058 MemOps.push_back(Store);
25059
25060 // Store ptr to reg_save_area.
25061 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25062 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25063 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25064 Store = DAG.getStore(
25065 Op.getOperand(0), DL, RSFIN, FIN,
25066 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25067 MemOps.push_back(Store);
25068 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25069}
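
// Worked example (illustrative): on 64-bit LP64 targets the code above fills
// in the SysV va_list record field by field,
//   byte  0: gp_offset          (i32, VarArgsGPOffset)
//   byte  4: fp_offset          (i32, VarArgsFPOffset)
//   byte  8: overflow_arg_area  (pointer to the on-stack argument area)
//   byte 16: reg_save_area      (pointer to the register save area;
//                                byte 12 on X32, where pointers are 4 bytes)
// and ties the four stores together with a single TokenFactor.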
25070
25071SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25072 assert(Subtarget.is64Bit() &&
25073 "LowerVAARG only handles 64-bit va_arg!");
25074 assert(Op.getNumOperands() == 4);
25075
25077 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25078 // The Win64 ABI uses char* instead of a structure.
25079 return DAG.expandVAArg(Op.getNode());
25080
25081 SDValue Chain = Op.getOperand(0);
25082 SDValue SrcPtr = Op.getOperand(1);
25083 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25084 unsigned Align = Op.getConstantOperandVal(3);
25085 SDLoc dl(Op);
25086
25087 EVT ArgVT = Op.getNode()->getValueType(0);
25088 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25089 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25090 uint8_t ArgMode;
25091
25092 // Decide which area this value should be read from.
25093 // TODO: Implement the AMD64 ABI in its entirety. This simple
25094 // selection mechanism works only for the basic types.
25095 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25096 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25097 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25098 } else {
25099 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25100 "Unhandled argument type in LowerVAARG");
25101 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25102 }
25103
25104 if (ArgMode == 2) {
25105 // Make sure using fp_offset makes sense.
25106 assert(!Subtarget.useSoftFloat() &&
25107 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25108 Subtarget.hasSSE1());
25109 }
25110
25111 // Insert VAARG node into the DAG
25112 // VAARG returns two values: Variable Argument Address, Chain
25113 SDValue InstOps[] = {Chain, SrcPtr,
25114 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25115 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25116 DAG.getTargetConstant(Align, dl, MVT::i32)};
25117 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25118 SDValue VAARG = DAG.getMemIntrinsicNode(
25119 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25120 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25121 /*Alignment=*/std::nullopt,
25122 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25123 Chain = VAARG.getValue(1);
25124
25125 // Load the next argument and return it
25126 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25127}
25128
25129static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25130 SelectionDAG &DAG) {
25131 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25132 // where a va_list is still an i8*.
25133 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25134 if (Subtarget.isCallingConvWin64(
25135 DAG.getMachineFunction().getFunction().getCallingConv()))
25136 // Probably a Win64 va_copy.
25137 return DAG.expandVACopy(Op.getNode());
25138
25139 SDValue Chain = Op.getOperand(0);
25140 SDValue DstPtr = Op.getOperand(1);
25141 SDValue SrcPtr = Op.getOperand(2);
25142 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25143 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25144 SDLoc DL(Op);
25145
25146 return DAG.getMemcpy(
25147 Chain, DL, DstPtr, SrcPtr,
25148 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25149 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25150 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
25151 MachinePointerInfo(SrcSV));
25152}
25153
25154// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25155static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25156 switch (Opc) {
25157 case ISD::SHL:
25158 case X86ISD::VSHL:
25159 case X86ISD::VSHLI:
25160 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25161 case ISD::SRL:
25162 case X86ISD::VSRL:
25163 case X86ISD::VSRLI:
25164 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25165 case ISD::SRA:
25166 case X86ISD::VSRA:
25167 case X86ISD::VSRAI:
25168 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25169 }
25170 llvm_unreachable("Unknown target vector shift node");
25171}
25172
25173/// Handle vector element shifts where the shift amount is a constant.
25174/// Takes immediate version of shift as input.
25175static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25176 SDValue SrcOp, uint64_t ShiftAmt,
25177 SelectionDAG &DAG) {
25178 MVT ElementType = VT.getVectorElementType();
25179
25180 // Bitcast the source vector to the output type, this is mainly necessary for
25181 // vXi8/vXi64 shifts.
25182 if (VT != SrcOp.getSimpleValueType())
25183 SrcOp = DAG.getBitcast(VT, SrcOp);
25184
25185 // Fold this packed shift into its first operand if ShiftAmt is 0.
25186 if (ShiftAmt == 0)
25187 return SrcOp;
25188
25189 // Check for ShiftAmt >= element width
25190 if (ShiftAmt >= ElementType.getSizeInBits()) {
25191 if (Opc == X86ISD::VSRAI)
25192 ShiftAmt = ElementType.getSizeInBits() - 1;
25193 else
25194 return DAG.getConstant(0, dl, VT);
25195 }
25196
25197 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25198 && "Unknown target vector shift-by-constant node");
25199
25200 // Fold this packed vector shift into a build vector if SrcOp is a
25201 // vector of Constants or UNDEFs.
25202 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25203 unsigned ShiftOpc;
25204 switch (Opc) {
25205 default: llvm_unreachable("Unknown opcode!");
25206 case X86ISD::VSHLI:
25207 ShiftOpc = ISD::SHL;
25208 break;
25209 case X86ISD::VSRLI:
25210 ShiftOpc = ISD::SRL;
25211 break;
25212 case X86ISD::VSRAI:
25213 ShiftOpc = ISD::SRA;
25214 break;
25215 }
25216
25217 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25218 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25219 return C;
25220 }
25221
25222 return DAG.getNode(Opc, dl, VT, SrcOp,
25223 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25224}
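
// Worked examples (illustrative) of the rules above:
//   VSHLI/VSRLI/VSRAI v4i32 x, 0  -> x                 (shift by zero folds away)
//   VSRLI             v4i32 x, 32 -> zero vector       (logical shift >= width)
//   VSRAI             v4i32 x, 40 -> VSRAI v4i32 x, 31 (arithmetic shift clamped)
// Constant-foldable sources are instead evaluated directly through
// FoldConstantArithmetic.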
25225
25226/// Handle vector element shifts by a splat shift amount
25227static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25228 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25229 const X86Subtarget &Subtarget,
25230 SelectionDAG &DAG) {
25231 MVT AmtVT = ShAmt.getSimpleValueType();
25232 assert(AmtVT.isVector() && "Vector shift type mismatch");
25233 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25234 "Illegal vector splat index");
25235
25236 // Move the splat element to the bottom element.
25237 if (ShAmtIdx != 0) {
25238 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25239 Mask[0] = ShAmtIdx;
25240 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25241 }
25242
25243 // Peek through any zext node if we can get back to a 128-bit source.
25244 if (AmtVT.getScalarSizeInBits() == 64 &&
25245 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25246 ShAmt.getOpcode() == ISD::ANY_EXTEND) &&
25247 ShAmt.getOperand(0).getValueType().isSimple() &&
25248 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25249 ShAmt = ShAmt.getOperand(0);
25250 AmtVT = ShAmt.getSimpleValueType();
25251 }
25252
25253 // See if we can mask off the upper elements using the existing source node.
25254 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25255 // do this for vXi64 types.
25256 bool IsMasked = false;
25257 if (AmtVT.getScalarSizeInBits() < 64) {
25258 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25259 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25260 // If the shift amount has come from a scalar, then zero-extend the scalar
25261 // before moving to the vector.
25262 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25263 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25264 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25265 AmtVT = MVT::v4i32;
25266 IsMasked = true;
25267 } else if (ShAmt.getOpcode() == ISD::AND) {
25268 // See if the shift amount is already masked (e.g. for rotation modulo),
25269 // then we can zero-extend it by setting all the other mask elements to
25270 // zero.
25271 SmallVector<SDValue> MaskElts(
25272 AmtVT.getVectorNumElements(),
25273 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25274 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25275 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25276 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25277 {ShAmt.getOperand(1), Mask}))) {
25278 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25279 IsMasked = true;
25280 }
25281 }
25282 }
25283
25284 // Extract if the shift amount vector is larger than 128-bits.
25285 if (AmtVT.getSizeInBits() > 128) {
25286 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25287 AmtVT = ShAmt.getSimpleValueType();
25288 }
25289
25290 // Zero-extend bottom element to v2i64 vector type, either by extension or
25291 // shuffle masking.
25292 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25293 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25294 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25295 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25296 } else if (Subtarget.hasSSE41()) {
25297 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25298 MVT::v2i64, ShAmt);
25299 } else {
25300 SDValue ByteShift = DAG.getTargetConstant(
25301 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25302 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25303 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25304 ByteShift);
25305 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25306 ByteShift);
25307 }
25308 }
25309
25310 // Change opcode to non-immediate version.
25311 Opc = getTargetVShiftUniformOpcode(Opc, true);
25312
25313 // The return type has to be a 128-bit type with the same element
25314 // type as the input type.
25315 MVT EltVT = VT.getVectorElementType();
25316 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25317
25318 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25319 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25320}
25321
25322/// Return Mask with the necessary casting or extending
25323/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25324static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25325 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25326 const SDLoc &dl) {
25327
25328 if (isAllOnesConstant(Mask))
25329 return DAG.getConstant(1, dl, MaskVT);
25330 if (X86::isZeroNode(Mask))
25331 return DAG.getConstant(0, dl, MaskVT);
25332
25333 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25334
25335 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25336 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25337 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25338 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25339 SDValue Lo, Hi;
25340 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25341 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25342 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25344 } else {
25345 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25346 Mask.getSimpleValueType().getSizeInBits());
25347 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
25348 // are extracted by EXTRACT_SUBVECTOR.
25349 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25350 DAG.getBitcast(BitcastVT, Mask),
25351 DAG.getIntPtrConstant(0, dl));
25352 }
25353}
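
// Worked example (illustrative): an i8 mask lowered for a v4i1 MaskVT is
// bitcast to v8i1 and the low four elements are taken with EXTRACT_SUBVECTOR;
// an i64 mask on a 32-bit target with a v64i1 MaskVT is instead split into two
// i32 halves, each bitcast to v32i1, and the halves are concatenated.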
25354
25355/// Return (and \p Op, \p Mask) for compare instructions or
25356/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25357/// necessary casting or extending for \p Mask when lowering masking intrinsics
25358 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25359 SDValue PreservedSrc,
25360 const X86Subtarget &Subtarget,
25361 SelectionDAG &DAG) {
25362 MVT VT = Op.getSimpleValueType();
25363 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25364 unsigned OpcodeSelect = ISD::VSELECT;
25365 SDLoc dl(Op);
25366
25367 if (isAllOnesConstant(Mask))
25368 return Op;
25369
25370 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25371
25372 if (PreservedSrc.isUndef())
25373 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25374 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25375}
25376
25377/// Creates an SDNode for a predicated scalar operation.
25378/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25379 /// The mask comes in as MVT::i8 and should be transformed
25380 /// to MVT::v1i1 while lowering masking intrinsics.
25381/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25382/// "X86select" instead of "vselect". We just can't create the "vselect" node
25383/// for a scalar instruction.
25384 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25385 SDValue PreservedSrc,
25386 const X86Subtarget &Subtarget,
25387 SelectionDAG &DAG) {
25388
25389 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25390 if (MaskConst->getZExtValue() & 0x1)
25391 return Op;
25392
25393 MVT VT = Op.getSimpleValueType();
25394 SDLoc dl(Op);
25395
25396 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25397 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25398 DAG.getBitcast(MVT::v8i1, Mask),
25399 DAG.getIntPtrConstant(0, dl));
25400 if (Op.getOpcode() == X86ISD::FSETCCM ||
25401 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25402 Op.getOpcode() == X86ISD::VFPCLASSS)
25403 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25404
25405 if (PreservedSrc.isUndef())
25406 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25407 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25408}
25409
25410 static int getSEHRegistrationNodeSize(const Function *Fn) {
25411 if (!Fn->hasPersonalityFn())
25413 "querying registration node size for function without personality");
25414 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25415 // WinEHStatePass for the full struct definition.
25416 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25417 case EHPersonality::MSVC_X86SEH: return 24;
25418 case EHPersonality::MSVC_CXX: return 16;
25419 default: break;
25420 }
25422 "can only recover FP for 32-bit MSVC EH personality functions");
25423}
25424
25425/// When the MSVC runtime transfers control to us, either to an outlined
25426/// function or when returning to a parent frame after catching an exception, we
25427/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25428/// Here's the math:
25429/// RegNodeBase = EntryEBP - RegNodeSize
25430/// ParentFP = RegNodeBase - ParentFrameOffset
25431/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25432/// subtracting the offset (negative on x86) takes us back to the parent FP.
25433 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25434 SDValue EntryEBP) {
25435 MachineFunction &MF = DAG.getMachineFunction();
25436 SDLoc dl;
25437
25438 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25439 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25440
25441 // It's possible that the parent function no longer has a personality function
25442 // if the exceptional code was optimized away, in which case we just return
25443 // the incoming EBP.
25444 if (!Fn->hasPersonalityFn())
25445 return EntryEBP;
25446
25447 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25448 // registration, or the .set_setframe offset.
25449 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
25450 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25451 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25452 SDValue ParentFrameOffset =
25453 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25454
25455 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25456 // prologue to RBP in the parent function.
25457 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25458 if (Subtarget.is64Bit())
25459 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25460
25461 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25462 // RegNodeBase = EntryEBP - RegNodeSize
25463 // ParentFP = RegNodeBase - ParentFrameOffset
25464 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25465 DAG.getConstant(RegNodeSize, dl, PtrVT));
25466 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25467}
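
// Worked example (illustrative, with made-up numbers): for 32-bit MSVC C++ EH
// the registration node is 16 bytes, so with EntryEBP = 0x1000 and a
// ParentFrameOffset of -0x40 recorded by WinEHStatePass,
//   RegNodeBase = 0x1000 - 16     = 0xFF0
//   ParentFP    = 0xFF0 - (-0x40) = 0x1030
// On x64 the computation above degenerates to EntryEBP + ParentFrameOffset.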
25468
25469SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25470 SelectionDAG &DAG) const {
25471 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25472 auto isRoundModeCurDirection = [](SDValue Rnd) {
25473 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25474 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25475
25476 return false;
25477 };
25478 auto isRoundModeSAE = [](SDValue Rnd) {
25479 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25480 unsigned RC = C->getZExtValue();
25481 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25482 // Clear the NO_EXC bit and check remaining bits.
25483 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25484 // As a convenience we allow no other bits or explicitly
25485 // current direction.
25486 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25487 }
25488 }
25489
25490 return false;
25491 };
25492 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25493 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25494 RC = C->getZExtValue();
25495 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25496 // Clear the NO_EXC bit and check remaining bits.
25497 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25498 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25499 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25500 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25501 RC == X86::STATIC_ROUNDING::TO_ZERO;
25502 }
25503 }
25504
25505 return false;
25506 };
25507
25508 SDLoc dl(Op);
25509 unsigned IntNo = Op.getConstantOperandVal(0);
25510 MVT VT = Op.getSimpleValueType();
25511 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25512
25513 // Propagate flags from original node to transformed node(s).
25514 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25515
25516 if (IntrData) {
25517 switch(IntrData->Type) {
25518 case INTR_TYPE_1OP: {
25519 // We specify 2 possible opcodes for intrinsics with rounding modes.
25520 // First, we check if the intrinsic may have non-default rounding mode,
25521 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25522 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25523 if (IntrWithRoundingModeOpcode != 0) {
25524 SDValue Rnd = Op.getOperand(2);
25525 unsigned RC = 0;
25526 if (isRoundModeSAEToX(Rnd, RC))
25527 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25528 Op.getOperand(1),
25529 DAG.getTargetConstant(RC, dl, MVT::i32));
25530 if (!isRoundModeCurDirection(Rnd))
25531 return SDValue();
25532 }
25533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25534 Op.getOperand(1));
25535 }
25536 case INTR_TYPE_1OP_SAE: {
25537 SDValue Sae = Op.getOperand(2);
25538
25539 unsigned Opc;
25540 if (isRoundModeCurDirection(Sae))
25541 Opc = IntrData->Opc0;
25542 else if (isRoundModeSAE(Sae))
25543 Opc = IntrData->Opc1;
25544 else
25545 return SDValue();
25546
25547 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25548 }
25549 case INTR_TYPE_2OP: {
25550 SDValue Src2 = Op.getOperand(2);
25551
25552 // We specify 2 possible opcodes for intrinsics with rounding modes.
25553 // First, we check if the intrinsic may have non-default rounding mode,
25554 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25555 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25556 if (IntrWithRoundingModeOpcode != 0) {
25557 SDValue Rnd = Op.getOperand(3);
25558 unsigned RC = 0;
25559 if (isRoundModeSAEToX(Rnd, RC))
25560 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25561 Op.getOperand(1), Src2,
25562 DAG.getTargetConstant(RC, dl, MVT::i32));
25563 if (!isRoundModeCurDirection(Rnd))
25564 return SDValue();
25565 }
25566
25567 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25568 Op.getOperand(1), Src2);
25569 }
25570 case INTR_TYPE_2OP_SAE: {
25571 SDValue Sae = Op.getOperand(3);
25572
25573 unsigned Opc;
25574 if (isRoundModeCurDirection(Sae))
25575 Opc = IntrData->Opc0;
25576 else if (isRoundModeSAE(Sae))
25577 Opc = IntrData->Opc1;
25578 else
25579 return SDValue();
25580
25581 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25582 Op.getOperand(2));
25583 }
25584 case INTR_TYPE_3OP:
25585 case INTR_TYPE_3OP_IMM8: {
25586 SDValue Src1 = Op.getOperand(1);
25587 SDValue Src2 = Op.getOperand(2);
25588 SDValue Src3 = Op.getOperand(3);
25589
25590 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25591 Src3.getValueType() != MVT::i8) {
25592 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25593 }
25594
25595 // We specify 2 possible opcodes for intrinsics with rounding modes.
25596 // First, we check if the intrinsic may have non-default rounding mode,
25597 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25598 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25599 if (IntrWithRoundingModeOpcode != 0) {
25600 SDValue Rnd = Op.getOperand(4);
25601 unsigned RC = 0;
25602 if (isRoundModeSAEToX(Rnd, RC))
25603 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25604 Src1, Src2, Src3,
25605 DAG.getTargetConstant(RC, dl, MVT::i32));
25606 if (!isRoundModeCurDirection(Rnd))
25607 return SDValue();
25608 }
25609
25610 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25611 {Src1, Src2, Src3});
25612 }
25613 case INTR_TYPE_4OP_IMM8: {
25614 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25615 SDValue Src4 = Op.getOperand(4);
25616 if (Src4.getValueType() != MVT::i8) {
25617 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25618 }
25619
25620 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25621 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25622 Src4);
25623 }
25624 case INTR_TYPE_1OP_MASK: {
25625 SDValue Src = Op.getOperand(1);
25626 SDValue PassThru = Op.getOperand(2);
25627 SDValue Mask = Op.getOperand(3);
25628 // We add rounding mode to the Node when
25629 // - RC Opcode is specified and
25630 // - RC is not "current direction".
25631 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25632 if (IntrWithRoundingModeOpcode != 0) {
25633 SDValue Rnd = Op.getOperand(4);
25634 unsigned RC = 0;
25635 if (isRoundModeSAEToX(Rnd, RC))
25636 return getVectorMaskingNode(
25637 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25638 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25639 Mask, PassThru, Subtarget, DAG);
25640 if (!isRoundModeCurDirection(Rnd))
25641 return SDValue();
25642 }
25643 return getVectorMaskingNode(
25644 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25645 Subtarget, DAG);
25646 }
25647 case INTR_TYPE_1OP_MASK_SAE: {
25648 SDValue Src = Op.getOperand(1);
25649 SDValue PassThru = Op.getOperand(2);
25650 SDValue Mask = Op.getOperand(3);
25651 SDValue Rnd = Op.getOperand(4);
25652
25653 unsigned Opc;
25654 if (isRoundModeCurDirection(Rnd))
25655 Opc = IntrData->Opc0;
25656 else if (isRoundModeSAE(Rnd))
25657 Opc = IntrData->Opc1;
25658 else
25659 return SDValue();
25660
25661 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25662 Subtarget, DAG);
25663 }
25664 case INTR_TYPE_SCALAR_MASK: {
25665 SDValue Src1 = Op.getOperand(1);
25666 SDValue Src2 = Op.getOperand(2);
25667 SDValue passThru = Op.getOperand(3);
25668 SDValue Mask = Op.getOperand(4);
25669 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25670 // There are 2 kinds of intrinsics in this group:
25671 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25672 // (2) With rounding mode and sae - 7 operands.
25673 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25674 if (Op.getNumOperands() == (5U + HasRounding)) {
25675 if (HasRounding) {
25676 SDValue Rnd = Op.getOperand(5);
25677 unsigned RC = 0;
25678 if (isRoundModeSAEToX(Rnd, RC))
25679 return getScalarMaskingNode(
25680 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25681 DAG.getTargetConstant(RC, dl, MVT::i32)),
25682 Mask, passThru, Subtarget, DAG);
25683 if (!isRoundModeCurDirection(Rnd))
25684 return SDValue();
25685 }
25686 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25687 Src2),
25688 Mask, passThru, Subtarget, DAG);
25689 }
25690
25691 assert(Op.getNumOperands() == (6U + HasRounding) &&
25692 "Unexpected intrinsic form");
25693 SDValue RoundingMode = Op.getOperand(5);
25694 unsigned Opc = IntrData->Opc0;
25695 if (HasRounding) {
25696 SDValue Sae = Op.getOperand(6);
25697 if (isRoundModeSAE(Sae))
25698 Opc = IntrWithRoundingModeOpcode;
25699 else if (!isRoundModeCurDirection(Sae))
25700 return SDValue();
25701 }
25702 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25703 Src2, RoundingMode),
25704 Mask, passThru, Subtarget, DAG);
25705 }
25706 case INTR_TYPE_SCALAR_MASK_RND: {
25707 SDValue Src1 = Op.getOperand(1);
25708 SDValue Src2 = Op.getOperand(2);
25709 SDValue passThru = Op.getOperand(3);
25710 SDValue Mask = Op.getOperand(4);
25711 SDValue Rnd = Op.getOperand(5);
25712
25713 SDValue NewOp;
25714 unsigned RC = 0;
25715 if (isRoundModeCurDirection(Rnd))
25716 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25717 else if (isRoundModeSAEToX(Rnd, RC))
25718 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25719 DAG.getTargetConstant(RC, dl, MVT::i32));
25720 else
25721 return SDValue();
25722
25723 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25724 }
25725 case INTR_TYPE_SCALAR_MASK_SAE: {
25726 SDValue Src1 = Op.getOperand(1);
25727 SDValue Src2 = Op.getOperand(2);
25728 SDValue passThru = Op.getOperand(3);
25729 SDValue Mask = Op.getOperand(4);
25730 SDValue Sae = Op.getOperand(5);
25731 unsigned Opc;
25732 if (isRoundModeCurDirection(Sae))
25733 Opc = IntrData->Opc0;
25734 else if (isRoundModeSAE(Sae))
25735 Opc = IntrData->Opc1;
25736 else
25737 return SDValue();
25738
25739 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25740 Mask, passThru, Subtarget, DAG);
25741 }
25742 case INTR_TYPE_2OP_MASK: {
25743 SDValue Src1 = Op.getOperand(1);
25744 SDValue Src2 = Op.getOperand(2);
25745 SDValue PassThru = Op.getOperand(3);
25746 SDValue Mask = Op.getOperand(4);
25747 SDValue NewOp;
25748 if (IntrData->Opc1 != 0) {
25749 SDValue Rnd = Op.getOperand(5);
25750 unsigned RC = 0;
25751 if (isRoundModeSAEToX(Rnd, RC))
25752 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25753 DAG.getTargetConstant(RC, dl, MVT::i32));
25754 else if (!isRoundModeCurDirection(Rnd))
25755 return SDValue();
25756 }
25757 if (!NewOp)
25758 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25759 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25760 }
25761 case INTR_TYPE_2OP_MASK_SAE: {
25762 SDValue Src1 = Op.getOperand(1);
25763 SDValue Src2 = Op.getOperand(2);
25764 SDValue PassThru = Op.getOperand(3);
25765 SDValue Mask = Op.getOperand(4);
25766
25767 unsigned Opc = IntrData->Opc0;
25768 if (IntrData->Opc1 != 0) {
25769 SDValue Sae = Op.getOperand(5);
25770 if (isRoundModeSAE(Sae))
25771 Opc = IntrData->Opc1;
25772 else if (!isRoundModeCurDirection(Sae))
25773 return SDValue();
25774 }
25775
25776 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25777 Mask, PassThru, Subtarget, DAG);
25778 }
25779 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25780 SDValue Src1 = Op.getOperand(1);
25781 SDValue Src2 = Op.getOperand(2);
25782 SDValue Src3 = Op.getOperand(3);
25783 SDValue PassThru = Op.getOperand(4);
25784 SDValue Mask = Op.getOperand(5);
25785 SDValue Sae = Op.getOperand(6);
25786 unsigned Opc;
25787 if (isRoundModeCurDirection(Sae))
25788 Opc = IntrData->Opc0;
25789 else if (isRoundModeSAE(Sae))
25790 Opc = IntrData->Opc1;
25791 else
25792 return SDValue();
25793
25794 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25795 Mask, PassThru, Subtarget, DAG);
25796 }
25797 case INTR_TYPE_3OP_MASK_SAE: {
25798 SDValue Src1 = Op.getOperand(1);
25799 SDValue Src2 = Op.getOperand(2);
25800 SDValue Src3 = Op.getOperand(3);
25801 SDValue PassThru = Op.getOperand(4);
25802 SDValue Mask = Op.getOperand(5);
25803
25804 unsigned Opc = IntrData->Opc0;
25805 if (IntrData->Opc1 != 0) {
25806 SDValue Sae = Op.getOperand(6);
25807 if (isRoundModeSAE(Sae))
25808 Opc = IntrData->Opc1;
25809 else if (!isRoundModeCurDirection(Sae))
25810 return SDValue();
25811 }
25812 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25813 Mask, PassThru, Subtarget, DAG);
25814 }
25815 case BLENDV: {
25816 SDValue Src1 = Op.getOperand(1);
25817 SDValue Src2 = Op.getOperand(2);
25818 SDValue Src3 = Op.getOperand(3);
25819
25820 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25821 Src3 = DAG.getBitcast(MaskVT, Src3);
25822
25823 // Reverse the operands to match VSELECT order.
25824 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25825 }
25826 case VPERM_2OP : {
25827 SDValue Src1 = Op.getOperand(1);
25828 SDValue Src2 = Op.getOperand(2);
25829
25830 // Swap Src1 and Src2 in the node creation
25831 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25832 }
25833 case CFMA_OP_MASKZ:
25834 case CFMA_OP_MASK: {
25835 SDValue Src1 = Op.getOperand(1);
25836 SDValue Src2 = Op.getOperand(2);
25837 SDValue Src3 = Op.getOperand(3);
25838 SDValue Mask = Op.getOperand(4);
25839 MVT VT = Op.getSimpleValueType();
25840
25841 SDValue PassThru = Src3;
25842 if (IntrData->Type == CFMA_OP_MASKZ)
25843 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25844
25845 // We add rounding mode to the Node when
25846 // - RC Opcode is specified and
25847 // - RC is not "current direction".
25848 SDValue NewOp;
25849 if (IntrData->Opc1 != 0) {
25850 SDValue Rnd = Op.getOperand(5);
25851 unsigned RC = 0;
25852 if (isRoundModeSAEToX(Rnd, RC))
25853 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25854 DAG.getTargetConstant(RC, dl, MVT::i32));
25855 else if (!isRoundModeCurDirection(Rnd))
25856 return SDValue();
25857 }
25858 if (!NewOp)
25859 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25860 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25861 }
25862 case IFMA_OP:
25863 // NOTE: We need to swizzle the operands to pass the multiply operands
25864 // first.
25865 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25866 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25867 case FPCLASSS: {
25868 SDValue Src1 = Op.getOperand(1);
25869 SDValue Imm = Op.getOperand(2);
25870 SDValue Mask = Op.getOperand(3);
25871 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25872 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25873 Subtarget, DAG);
25874 // Need to fill with zeros to ensure the bitcast will produce zeroes
25875 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25876 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25877 DAG.getConstant(0, dl, MVT::v8i1),
25878 FPclassMask, DAG.getIntPtrConstant(0, dl));
25879 return DAG.getBitcast(MVT::i8, Ins);
25880 }
25881
25882 case CMP_MASK_CC: {
25883 MVT MaskVT = Op.getSimpleValueType();
25884 SDValue CC = Op.getOperand(3);
25885 SDValue Mask = Op.getOperand(4);
25886 // We specify 2 possible opcodes for intrinsics with rounding modes.
25887 // First, we check if the intrinsic may have non-default rounding mode,
25888 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25889 if (IntrData->Opc1 != 0) {
25890 SDValue Sae = Op.getOperand(5);
25891 if (isRoundModeSAE(Sae))
25892 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25893 Op.getOperand(2), CC, Mask, Sae);
25894 if (!isRoundModeCurDirection(Sae))
25895 return SDValue();
25896 }
25897 //default rounding mode
25898 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25899 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25900 }
25901 case CMP_MASK_SCALAR_CC: {
25902 SDValue Src1 = Op.getOperand(1);
25903 SDValue Src2 = Op.getOperand(2);
25904 SDValue CC = Op.getOperand(3);
25905 SDValue Mask = Op.getOperand(4);
25906
25907 SDValue Cmp;
25908 if (IntrData->Opc1 != 0) {
25909 SDValue Sae = Op.getOperand(5);
25910 if (isRoundModeSAE(Sae))
25911 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25912 else if (!isRoundModeCurDirection(Sae))
25913 return SDValue();
25914 }
25915 //default rounding mode
25916 if (!Cmp.getNode())
25917 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25918
25919 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25920 Subtarget, DAG);
25921 // Need to fill with zeros to ensure the bitcast will produce zeroes
25922 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25923 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25924 DAG.getConstant(0, dl, MVT::v8i1),
25925 CmpMask, DAG.getIntPtrConstant(0, dl));
25926 return DAG.getBitcast(MVT::i8, Ins);
25927 }
25928 case COMI: { // Comparison intrinsics
25929 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25930 SDValue LHS = Op.getOperand(1);
25931 SDValue RHS = Op.getOperand(2);
25932 // Some conditions require the operands to be swapped.
25933 if (CC == ISD::SETLT || CC == ISD::SETLE)
25934 std::swap(LHS, RHS);
25935
25936 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25937 SDValue SetCC;
25938 switch (CC) {
25939 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25940 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25941 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25942 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25943 break;
25944 }
25945 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25946 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25947 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25948 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25949 break;
25950 }
25951 case ISD::SETGT: // (CF = 0 and ZF = 0)
25952 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25953 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25954 break;
25955 }
25956 case ISD::SETGE: // CF = 0
25957 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25958 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25959 break;
25960 default:
25961 llvm_unreachable("Unexpected illegal condition!");
25962 }
25963 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25964 }
25965 case COMI_RM: { // Comparison intrinsics with Sae
25966 SDValue LHS = Op.getOperand(1);
25967 SDValue RHS = Op.getOperand(2);
25968 unsigned CondVal = Op.getConstantOperandVal(3);
25969 SDValue Sae = Op.getOperand(4);
25970
25971 SDValue FCmp;
25972 if (isRoundModeCurDirection(Sae))
25973 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25974 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25975 else if (isRoundModeSAE(Sae))
25976 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25977 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25978 else
25979 return SDValue();
25980 // Need to fill with zeros to ensure the bitcast will produce zeroes
25981 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25982 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25983 DAG.getConstant(0, dl, MVT::v16i1),
25984 FCmp, DAG.getIntPtrConstant(0, dl));
25985 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25986 DAG.getBitcast(MVT::i16, Ins));
25987 }
25988 case VSHIFT: {
25989 SDValue SrcOp = Op.getOperand(1);
25990 SDValue ShAmt = Op.getOperand(2);
25991 assert(ShAmt.getValueType() == MVT::i32 &&
25992 "Unexpected VSHIFT amount type");
25993
25994 // Catch shift-by-constant.
25995 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25996 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25997 Op.getSimpleValueType(), SrcOp,
25998 CShAmt->getZExtValue(), DAG);
25999
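    // Non-constant shift amount: place the i32 amount in the low element of a
    // v4i32 so the target shift node can take it as its vector shift-amount operand.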
26000 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26001 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26002 SrcOp, ShAmt, 0, Subtarget, DAG);
26003 }
26005 SDValue Mask = Op.getOperand(3);
26006 SDValue DataToCompress = Op.getOperand(1);
26007 SDValue PassThru = Op.getOperand(2);
26008 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26009 return Op.getOperand(1);
26010
26011 // Avoid false dependency.
26012 if (PassThru.isUndef())
26013 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26014
26015 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26016 Mask);
26017 }
26018 case FIXUPIMM:
26019 case FIXUPIMM_MASKZ: {
26020 SDValue Src1 = Op.getOperand(1);
26021 SDValue Src2 = Op.getOperand(2);
26022 SDValue Src3 = Op.getOperand(3);
26023 SDValue Imm = Op.getOperand(4);
26024 SDValue Mask = Op.getOperand(5);
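    // The merge-masking form (FIXUPIMM) blends with the first source operand;
    // the MASKZ form zeroes the masked elements instead.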
26025 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26026 ? Src1
26027 : getZeroVector(VT, Subtarget, DAG, dl);
26028
26029 unsigned Opc = IntrData->Opc0;
26030 if (IntrData->Opc1 != 0) {
26031 SDValue Sae = Op.getOperand(6);
26032 if (isRoundModeSAE(Sae))
26033 Opc = IntrData->Opc1;
26034 else if (!isRoundModeCurDirection(Sae))
26035 return SDValue();
26036 }
26037
26038 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26039
26040 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26041 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26042
26043 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26044 }
26045 case ROUNDP: {
26046 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26047 // Clear the upper bits of the rounding immediate so that the legacy
26048 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26049 uint64_t Round = Op.getConstantOperandVal(2);
26050 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26051 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26052 Op.getOperand(1), RoundingMode);
26053 }
26054 case ROUNDS: {
26055 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26056 // Clear the upper bits of the rounding immediate so that the legacy
26057 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26058 uint64_t Round = Op.getConstantOperandVal(3);
26059 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26060 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26061 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26062 }
26063 case BEXTRI: {
26064 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26065
26066 uint64_t Imm = Op.getConstantOperandVal(2);
26067 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26068 Op.getValueType());
26069 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26070 Op.getOperand(1), Control);
26071 }
26072 // ADC/SBB
26073 case ADX: {
26074 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26075 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26076
26077 SDValue Res;
26078 // If the carry in is zero, then we should just use ADD/SUB instead of
26079 // ADC/SBB.
26080 if (isNullConstant(Op.getOperand(1))) {
26081 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26082 Op.getOperand(3));
26083 } else {
26084 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26085 DAG.getConstant(-1, dl, MVT::i8));
26086 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26087 Op.getOperand(3), GenCF.getValue(1));
26088 }
26089 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26090 SDValue Results[] = { SetCC, Res };
26091 return DAG.getMergeValues(Results, dl);
26092 }
26093 case CVTPD2PS_MASK:
26094 case CVTPD2DQ_MASK:
26095 case CVTQQ2PS_MASK:
26096 case TRUNCATE_TO_REG: {
26097 SDValue Src = Op.getOperand(1);
26098 SDValue PassThru = Op.getOperand(2);
26099 SDValue Mask = Op.getOperand(3);
26100
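    // An all-ones mask leaves every element live, so emit the plain unmasked
    // conversion/truncation node.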
26101 if (isAllOnesConstant(Mask))
26102 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26103
26104 MVT SrcVT = Src.getSimpleValueType();
26105 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26106 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26107 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26108 {Src, PassThru, Mask});
26109 }
26110 case CVTPS2PH_MASK: {
26111 SDValue Src = Op.getOperand(1);
26112 SDValue Rnd = Op.getOperand(2);
26113 SDValue PassThru = Op.getOperand(3);
26114 SDValue Mask = Op.getOperand(4);
26115
26116 unsigned RC = 0;
26117 unsigned Opc = IntrData->Opc0;
26118 bool SAE = Src.getValueType().is512BitVector() &&
26119 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26120 if (SAE) {
26122 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26123 }
26124
26125 if (isAllOnesConstant(Mask))
26126 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26127
26128 if (SAE)
26130 else
26131 Opc = IntrData->Opc1;
26132 MVT SrcVT = Src.getSimpleValueType();
26133 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26134 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26135 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26136 }
26137 case CVTNEPS2BF16_MASK: {
26138 SDValue Src = Op.getOperand(1);
26139 SDValue PassThru = Op.getOperand(2);
26140 SDValue Mask = Op.getOperand(3);
26141
26142 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26143 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26144
26145 // Break false dependency.
26146 if (PassThru.isUndef())
26147 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26148
26149 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26150 Mask);
26151 }
26152 default:
26153 break;
26154 }
26155 }
26156
26157 switch (IntNo) {
26158 default: return SDValue(); // Don't custom lower most intrinsics.
26159
26160  // ptest and testp intrinsics. The intrinsics these come from are designed to
26161  // return an integer value rather than just an instruction, so lower them to
26162  // the ptest or testp pattern plus a setcc for the result.
26163 case Intrinsic::x86_avx512_ktestc_b:
26164 case Intrinsic::x86_avx512_ktestc_w:
26165 case Intrinsic::x86_avx512_ktestc_d:
26166 case Intrinsic::x86_avx512_ktestc_q:
26167 case Intrinsic::x86_avx512_ktestz_b:
26168 case Intrinsic::x86_avx512_ktestz_w:
26169 case Intrinsic::x86_avx512_ktestz_d:
26170 case Intrinsic::x86_avx512_ktestz_q:
26171 case Intrinsic::x86_sse41_ptestz:
26172 case Intrinsic::x86_sse41_ptestc:
26173 case Intrinsic::x86_sse41_ptestnzc:
26174 case Intrinsic::x86_avx_ptestz_256:
26175 case Intrinsic::x86_avx_ptestc_256:
26176 case Intrinsic::x86_avx_ptestnzc_256:
26177 case Intrinsic::x86_avx_vtestz_ps:
26178 case Intrinsic::x86_avx_vtestc_ps:
26179 case Intrinsic::x86_avx_vtestnzc_ps:
26180 case Intrinsic::x86_avx_vtestz_pd:
26181 case Intrinsic::x86_avx_vtestc_pd:
26182 case Intrinsic::x86_avx_vtestnzc_pd:
26183 case Intrinsic::x86_avx_vtestz_ps_256:
26184 case Intrinsic::x86_avx_vtestc_ps_256:
26185 case Intrinsic::x86_avx_vtestnzc_ps_256:
26186 case Intrinsic::x86_avx_vtestz_pd_256:
26187 case Intrinsic::x86_avx_vtestc_pd_256:
26188 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26189 unsigned TestOpc = X86ISD::PTEST;
26190 X86::CondCode X86CC;
26191 switch (IntNo) {
26192 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26193 case Intrinsic::x86_avx512_ktestc_b:
26194 case Intrinsic::x86_avx512_ktestc_w:
26195 case Intrinsic::x86_avx512_ktestc_d:
26196 case Intrinsic::x86_avx512_ktestc_q:
26197 // CF = 1
26198 TestOpc = X86ISD::KTEST;
26199 X86CC = X86::COND_B;
26200 break;
26201 case Intrinsic::x86_avx512_ktestz_b:
26202 case Intrinsic::x86_avx512_ktestz_w:
26203 case Intrinsic::x86_avx512_ktestz_d:
26204 case Intrinsic::x86_avx512_ktestz_q:
26205 TestOpc = X86ISD::KTEST;
26206 X86CC = X86::COND_E;
26207 break;
26208 case Intrinsic::x86_avx_vtestz_ps:
26209 case Intrinsic::x86_avx_vtestz_pd:
26210 case Intrinsic::x86_avx_vtestz_ps_256:
26211 case Intrinsic::x86_avx_vtestz_pd_256:
26212 TestOpc = X86ISD::TESTP;
26213 [[fallthrough]];
26214 case Intrinsic::x86_sse41_ptestz:
26215 case Intrinsic::x86_avx_ptestz_256:
26216 // ZF = 1
26217 X86CC = X86::COND_E;
26218 break;
26219 case Intrinsic::x86_avx_vtestc_ps:
26220 case Intrinsic::x86_avx_vtestc_pd:
26221 case Intrinsic::x86_avx_vtestc_ps_256:
26222 case Intrinsic::x86_avx_vtestc_pd_256:
26223 TestOpc = X86ISD::TESTP;
26224 [[fallthrough]];
26225 case Intrinsic::x86_sse41_ptestc:
26226 case Intrinsic::x86_avx_ptestc_256:
26227 // CF = 1
26228 X86CC = X86::COND_B;
26229 break;
26230 case Intrinsic::x86_avx_vtestnzc_ps:
26231 case Intrinsic::x86_avx_vtestnzc_pd:
26232 case Intrinsic::x86_avx_vtestnzc_ps_256:
26233 case Intrinsic::x86_avx_vtestnzc_pd_256:
26234 TestOpc = X86ISD::TESTP;
26235 [[fallthrough]];
26236 case Intrinsic::x86_sse41_ptestnzc:
26237 case Intrinsic::x86_avx_ptestnzc_256:
26238 // ZF and CF = 0
26239 X86CC = X86::COND_A;
26240 break;
26241 }
26242
26243 SDValue LHS = Op.getOperand(1);
26244 SDValue RHS = Op.getOperand(2);
26245 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26246 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26247 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26248 }
26249
26250 case Intrinsic::x86_sse42_pcmpistria128:
26251 case Intrinsic::x86_sse42_pcmpestria128:
26252 case Intrinsic::x86_sse42_pcmpistric128:
26253 case Intrinsic::x86_sse42_pcmpestric128:
26254 case Intrinsic::x86_sse42_pcmpistrio128:
26255 case Intrinsic::x86_sse42_pcmpestrio128:
26256 case Intrinsic::x86_sse42_pcmpistris128:
26257 case Intrinsic::x86_sse42_pcmpestris128:
26258 case Intrinsic::x86_sse42_pcmpistriz128:
26259 case Intrinsic::x86_sse42_pcmpestriz128: {
26260 unsigned Opcode;
26261 X86::CondCode X86CC;
26262 switch (IntNo) {
26263 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26264 case Intrinsic::x86_sse42_pcmpistria128:
26265 Opcode = X86ISD::PCMPISTR;
26266 X86CC = X86::COND_A;
26267 break;
26268 case Intrinsic::x86_sse42_pcmpestria128:
26269 Opcode = X86ISD::PCMPESTR;
26270 X86CC = X86::COND_A;
26271 break;
26272 case Intrinsic::x86_sse42_pcmpistric128:
26273 Opcode = X86ISD::PCMPISTR;
26274 X86CC = X86::COND_B;
26275 break;
26276 case Intrinsic::x86_sse42_pcmpestric128:
26277 Opcode = X86ISD::PCMPESTR;
26278 X86CC = X86::COND_B;
26279 break;
26280 case Intrinsic::x86_sse42_pcmpistrio128:
26281 Opcode = X86ISD::PCMPISTR;
26282 X86CC = X86::COND_O;
26283 break;
26284 case Intrinsic::x86_sse42_pcmpestrio128:
26285 Opcode = X86ISD::PCMPESTR;
26286 X86CC = X86::COND_O;
26287 break;
26288 case Intrinsic::x86_sse42_pcmpistris128:
26289 Opcode = X86ISD::PCMPISTR;
26290 X86CC = X86::COND_S;
26291 break;
26292 case Intrinsic::x86_sse42_pcmpestris128:
26293 Opcode = X86ISD::PCMPESTR;
26294 X86CC = X86::COND_S;
26295 break;
26296 case Intrinsic::x86_sse42_pcmpistriz128:
26297 Opcode = X86ISD::PCMPISTR;
26298 X86CC = X86::COND_E;
26299 break;
26300 case Intrinsic::x86_sse42_pcmpestriz128:
26301 Opcode = X86ISD::PCMPESTR;
26302 X86CC = X86::COND_E;
26303 break;
26304 }
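    // Result 2 of the PCMPISTR/PCMPESTR node is EFLAGS; test the requested
    // condition on it and zero-extend the i8 setcc result to i32.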
26306 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26307 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26308 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26309 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26310 }
26311
26312 case Intrinsic::x86_sse42_pcmpistri128:
26313 case Intrinsic::x86_sse42_pcmpestri128: {
26314 unsigned Opcode;
26315 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26316 Opcode = X86ISD::PCMPISTR;
26317 else
26318 Opcode = X86ISD::PCMPESTR;
26319
26321 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26322 return DAG.getNode(Opcode, dl, VTs, NewOps);
26323 }
26324
26325 case Intrinsic::x86_sse42_pcmpistrm128:
26326 case Intrinsic::x86_sse42_pcmpestrm128: {
26327 unsigned Opcode;
26328 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26329 Opcode = X86ISD::PCMPISTR;
26330 else
26331 Opcode = X86ISD::PCMPESTR;
26332
26334 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26335 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26336 }
26337
26338 case Intrinsic::eh_sjlj_lsda: {
26340 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26341 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26342 auto &Context = MF.getContext();
26343 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26344 Twine(MF.getFunctionNumber()));
26345 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26346 DAG.getMCSymbol(S, PtrVT));
26347 }
26348
26349 case Intrinsic::x86_seh_lsda: {
26350 // Compute the symbol for the LSDA. We know it'll get emitted later.
26352 SDValue Op1 = Op.getOperand(1);
26353 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26356
26357 // Generate a simple absolute symbol reference. This intrinsic is only
26358 // supported on 32-bit Windows, which isn't PIC.
26359 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26360 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26361 }
26362
26363 case Intrinsic::eh_recoverfp: {
26364 SDValue FnOp = Op.getOperand(1);
26365 SDValue IncomingFPOp = Op.getOperand(2);
26366 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26367 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26368 if (!Fn)
26370 "llvm.eh.recoverfp must take a function as the first argument");
26371 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26372 }
26373
26374 case Intrinsic::localaddress: {
26375 // Returns one of the stack, base, or frame pointer registers, depending on
26376 // which is used to reference local variables.
26378 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26379 unsigned Reg;
26380 if (RegInfo->hasBasePointer(MF))
26381 Reg = RegInfo->getBaseRegister();
26382 else { // Handles the SP or FP case.
26383 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26384 if (CantUseFP)
26385 Reg = RegInfo->getPtrSizedStackRegister(MF);
26386 else
26387 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26388 }
26389 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26390 }
26391 case Intrinsic::x86_avx512_vp2intersect_q_512:
26392 case Intrinsic::x86_avx512_vp2intersect_q_256:
26393 case Intrinsic::x86_avx512_vp2intersect_q_128:
26394 case Intrinsic::x86_avx512_vp2intersect_d_512:
26395 case Intrinsic::x86_avx512_vp2intersect_d_256:
26396 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26397 MVT MaskVT = Op.getSimpleValueType();
26398
26399 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26400 SDLoc DL(Op);
26401
26404 Op->getOperand(1), Op->getOperand(2));
26405
26406 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26407 MaskVT, Operation);
26408 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26409 MaskVT, Operation);
26410 return DAG.getMergeValues({Result0, Result1}, DL);
26411 }
26412 case Intrinsic::x86_mmx_pslli_w:
26413 case Intrinsic::x86_mmx_pslli_d:
26414 case Intrinsic::x86_mmx_pslli_q:
26415 case Intrinsic::x86_mmx_psrli_w:
26416 case Intrinsic::x86_mmx_psrli_d:
26417 case Intrinsic::x86_mmx_psrli_q:
26418 case Intrinsic::x86_mmx_psrai_w:
26419 case Intrinsic::x86_mmx_psrai_d: {
26420 SDLoc DL(Op);
26421 SDValue ShAmt = Op.getOperand(2);
26422 // If the argument is a constant, convert it to a target constant.
26423 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26424      // Clamp out-of-bounds shift amounts since they will otherwise be masked
26425      // to 8 bits, which may bring them back into bounds.
26426 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26427 if (ShiftAmount == 0)
26428 return Op.getOperand(1);
26429
26430 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26431 Op.getOperand(0), Op.getOperand(1),
26432 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26433 }
26434
26435 unsigned NewIntrinsic;
26436 switch (IntNo) {
26437 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26438 case Intrinsic::x86_mmx_pslli_w:
26439 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26440 break;
26441 case Intrinsic::x86_mmx_pslli_d:
26442 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26443 break;
26444 case Intrinsic::x86_mmx_pslli_q:
26445 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26446 break;
26447 case Intrinsic::x86_mmx_psrli_w:
26448 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26449 break;
26450 case Intrinsic::x86_mmx_psrli_d:
26451 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26452 break;
26453 case Intrinsic::x86_mmx_psrli_q:
26454 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26455 break;
26456 case Intrinsic::x86_mmx_psrai_w:
26457 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26458 break;
26459 case Intrinsic::x86_mmx_psrai_d:
26460 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26461 break;
26462 }
26463
26464    // The vector shift intrinsics with scalar shift amounts use 32-bit values,
26465    // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26466    // MMX register.
26467 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26468 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26469 DAG.getTargetConstant(NewIntrinsic, DL,
26471 Op.getOperand(1), ShAmt);
26472 }
26473 case Intrinsic::thread_pointer: {
26474 if (Subtarget.isTargetELF()) {
26475 SDLoc dl(Op);
26476 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26477 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26479 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26480 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26482 }
26484 "Target OS doesn't support __builtin_thread_pointer() yet.");
26485 }
26486 }
26487}
26488
26490 SDValue Src, SDValue Mask, SDValue Base,
26491 SDValue Index, SDValue ScaleOp, SDValue Chain,
26492 const X86Subtarget &Subtarget) {
26493 SDLoc dl(Op);
26494 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26495 // Scale must be constant.
26496 if (!C)
26497 return SDValue();
26498 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26499 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26500 TLI.getPointerTy(DAG.getDataLayout()));
26501 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26502 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26503 // If source is undef or we know it won't be used, use a zero vector
26504 // to break register dependency.
26505 // TODO: use undef instead and let BreakFalseDeps deal with it?
26506 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26507 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26508
26509 // Cast mask to an integer type.
26510 Mask = DAG.getBitcast(MaskVT, Mask);
26511
26512 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26513
26514 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26515 SDValue Res =
26516 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26517 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26518 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26519}
26520
26522 SDValue Src, SDValue Mask, SDValue Base,
26523 SDValue Index, SDValue ScaleOp, SDValue Chain,
26524 const X86Subtarget &Subtarget) {
26525 MVT VT = Op.getSimpleValueType();
26526 SDLoc dl(Op);
26527 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26528 // Scale must be constant.
26529 if (!C)
26530 return SDValue();
26531 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26532 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26533 TLI.getPointerTy(DAG.getDataLayout()));
26534 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26536 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26537
26538  // We support two versions of the gather intrinsics: one with a scalar mask
26539  // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26540 if (Mask.getValueType() != MaskVT)
26541 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26542
26543 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26544 // If source is undef or we know it won't be used, use a zero vector
26545 // to break register dependency.
26546 // TODO: use undef instead and let BreakFalseDeps deal with it?
26547 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26548 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26549
26550 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26551
26552 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26553 SDValue Res =
26554 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26555 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26556 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26557}
26558
26559static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26560 SDValue Src, SDValue Mask, SDValue Base,
26561 SDValue Index, SDValue ScaleOp, SDValue Chain,
26562 const X86Subtarget &Subtarget) {
26563 SDLoc dl(Op);
26564 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26565 // Scale must be constant.
26566 if (!C)
26567 return SDValue();
26568 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26569 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26570 TLI.getPointerTy(DAG.getDataLayout()));
26571 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26572 Src.getSimpleValueType().getVectorNumElements());
26573 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26574
26575  // We support two versions of the scatter intrinsics: one with a scalar mask
26576  // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26577 if (Mask.getValueType() != MaskVT)
26578 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26579
26580 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26581
26582 SDVTList VTs = DAG.getVTList(MVT::Other);
26583 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26584 SDValue Res =
26585 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26586 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26587 return Res;
26588}
26589
26590static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26592 SDValue ScaleOp, SDValue Chain,
26593 const X86Subtarget &Subtarget) {
26594 SDLoc dl(Op);
26595 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26596 // Scale must be constant.
26597 if (!C)
26598 return SDValue();
26599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26600 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26601 TLI.getPointerTy(DAG.getDataLayout()));
26602 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26603 SDValue Segment = DAG.getRegister(0, MVT::i32);
26604 MVT MaskVT =
26605 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26606 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
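  // The prefetch machine nodes take the full memory-operand tuple: mask, base,
  // scale, index, displacement, segment, and finally the chain.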
26607 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26608 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26609 return SDValue(Res, 0);
26610}
26611
26612/// Handles the lowering of builtin intrinsics with a chain that return their
26613/// value into registers EDX:EAX.
26614/// If operand SrcReg is a valid register identifier, then operand 2 of N is
26615/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26616/// TargetOpcode.
26617/// Returns a Glue value which can be used to add extra copy-from-reg if the
26618/// expanded intrinsic implicitly defines extra registers (i.e. not just
26619/// EDX:EAX).
26621 SelectionDAG &DAG,
26622 unsigned TargetOpcode,
26623 unsigned SrcReg,
26624 const X86Subtarget &Subtarget,
26626 SDValue Chain = N->getOperand(0);
26627 SDValue Glue;
26628
26629 if (SrcReg) {
26630 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26631 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26632 Glue = Chain.getValue(1);
26633 }
26634
26635 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26636 SDValue N1Ops[] = {Chain, Glue};
26637 SDNode *N1 = DAG.getMachineNode(
26638 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26639 Chain = SDValue(N1, 0);
26640
26641  // The expanded instruction returns its result in registers EDX:EAX.
26642 SDValue LO, HI;
26643 if (Subtarget.is64Bit()) {
26644 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26645 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26646 LO.getValue(2));
26647 } else {
26648 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26649 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26650 LO.getValue(2));
26651 }
26652 Chain = HI.getValue(1);
26653 Glue = HI.getValue(2);
26654
26655 if (Subtarget.is64Bit()) {
26656 // Merge the two 32-bit values into a 64-bit one.
26657 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26658 DAG.getConstant(32, DL, MVT::i8));
26659 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26660 Results.push_back(Chain);
26661 return Glue;
26662 }
26663
26664 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26665 SDValue Ops[] = { LO, HI };
26666 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26667 Results.push_back(Pair);
26668 Results.push_back(Chain);
26669 return Glue;
26670}
26671
26672/// Handles the lowering of builtin intrinsics that read the time stamp counter
26673/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26674/// READCYCLECOUNTER nodes.
26675static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26676 SelectionDAG &DAG,
26677 const X86Subtarget &Subtarget,
26679 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26680 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26681 // and the EAX register is loaded with the low-order 32 bits.
26682 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26683 /* NoRegister */0, Subtarget,
26684 Results);
26685 if (Opcode != X86::RDTSCP)
26686 return;
26687
26688 SDValue Chain = Results[1];
26689  // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
26690  // into the ECX register. Add the ECX read explicitly to the results and chain.
26691 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26692 Results[1] = ecx;
26693 Results.push_back(ecx.getValue(1));
26694}
26695
26697 SelectionDAG &DAG) {
26699 SDLoc DL(Op);
26700 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26701 Results);
26702 return DAG.getMergeValues(Results, DL);
26703}
26704
26707 SDValue Chain = Op.getOperand(0);
26708 SDValue RegNode = Op.getOperand(2);
26709 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26710 if (!EHInfo)
26711 report_fatal_error("EH registrations only live in functions using WinEH");
26712
26713 // Cast the operand to an alloca, and remember the frame index.
26714 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26715 if (!FINode)
26716 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26717 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26718
26719 // Return the chain operand without making any DAG nodes.
26720 return Chain;
26721}
26722
26725 SDValue Chain = Op.getOperand(0);
26726 SDValue EHGuard = Op.getOperand(2);
26727 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26728 if (!EHInfo)
26729    report_fatal_error("EHGuard nodes only live in functions using WinEH");
26730
26731 // Cast the operand to an alloca, and remember the frame index.
26732 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26733 if (!FINode)
26734 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26735 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26736
26737 // Return the chain operand without making any DAG nodes.
26738 return Chain;
26739}
26740
26741/// Emit Truncating Store with signed or unsigned saturation.
26742static SDValue
26743EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26744 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26745 SelectionDAG &DAG) {
26746 SDVTList VTs = DAG.getVTList(MVT::Other);
26747 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26748 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26749 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26750 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26751}
26752
26753/// Emit Masked Truncating Store with signed or unsigned saturation.
26754static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26755 const SDLoc &DL,
26756 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26757 MachineMemOperand *MMO, SelectionDAG &DAG) {
26758 SDVTList VTs = DAG.getVTList(MVT::Other);
26759 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26760 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26761 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26762}
26763
26765 const MachineFunction &MF) {
26766 if (!Subtarget.is64Bit())
26767 return false;
26768  // 64-bit targets support extended Swift async frame setup,
26769  // except for targets that use the Windows 64 prologue.
26770 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26771}
26772
26774 SelectionDAG &DAG) {
26775 unsigned IntNo = Op.getConstantOperandVal(1);
26776 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26777 if (!IntrData) {
26778 switch (IntNo) {
26779
26780 case Intrinsic::swift_async_context_addr: {
26781 SDLoc dl(Op);
26782 auto &MF = DAG.getMachineFunction();
26783 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
26784 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26786 X86FI->setHasSwiftAsyncContext(true);
26787 SDValue Chain = Op->getOperand(0);
26788 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26789 SDValue Result =
26790 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26791 DAG.getTargetConstant(8, dl, MVT::i32)),
26792 0);
26793 // Return { result, chain }.
26794 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26795 CopyRBP.getValue(1));
26796 } else {
26797 // No special extended frame, create or reuse an existing stack slot.
26798 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26799 if (!X86FI->getSwiftAsyncContextFrameIdx())
26800 X86FI->setSwiftAsyncContextFrameIdx(
26801 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26802 false));
26803 SDValue Result =
26804 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26805 PtrSize == 8 ? MVT::i64 : MVT::i32);
26806 // Return { result, chain }.
26807 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26808 Op->getOperand(0));
26809 }
26810 }
26811
26812 case llvm::Intrinsic::x86_seh_ehregnode:
26813 return MarkEHRegistrationNode(Op, DAG);
26814 case llvm::Intrinsic::x86_seh_ehguard:
26815 return MarkEHGuard(Op, DAG);
26816 case llvm::Intrinsic::x86_rdpkru: {
26817 SDLoc dl(Op);
26818 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26819      // Create an RDPKRU node and pass 0 to the ECX parameter.
26820 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26821 DAG.getConstant(0, dl, MVT::i32));
26822 }
26823 case llvm::Intrinsic::x86_wrpkru: {
26824 SDLoc dl(Op);
26825 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26826 // to the EDX and ECX parameters.
26827 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26828 Op.getOperand(0), Op.getOperand(2),
26829 DAG.getConstant(0, dl, MVT::i32),
26830 DAG.getConstant(0, dl, MVT::i32));
26831 }
26832 case llvm::Intrinsic::asan_check_memaccess: {
26833 // Mark this as adjustsStack because it will be lowered to a call.
26835 // Don't do anything here, we will expand these intrinsics out later.
26836 return Op;
26837 }
26838 case llvm::Intrinsic::x86_flags_read_u32:
26839 case llvm::Intrinsic::x86_flags_read_u64:
26840 case llvm::Intrinsic::x86_flags_write_u32:
26841 case llvm::Intrinsic::x86_flags_write_u64: {
26842 // We need a frame pointer because this will get lowered to a PUSH/POP
26843 // sequence.
26846 // Don't do anything here, we will expand these intrinsics out later
26847 // during FinalizeISel in EmitInstrWithCustomInserter.
26848 return Op;
26849 }
26850 case Intrinsic::x86_lwpins32:
26851 case Intrinsic::x86_lwpins64:
26852 case Intrinsic::x86_umwait:
26853 case Intrinsic::x86_tpause: {
26854 SDLoc dl(Op);
26855 SDValue Chain = Op->getOperand(0);
26856 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26857 unsigned Opcode;
26858
26859 switch (IntNo) {
26860 default: llvm_unreachable("Impossible intrinsic");
26861 case Intrinsic::x86_umwait:
26862 Opcode = X86ISD::UMWAIT;
26863 break;
26864 case Intrinsic::x86_tpause:
26865 Opcode = X86ISD::TPAUSE;
26866 break;
26867 case Intrinsic::x86_lwpins32:
26868 case Intrinsic::x86_lwpins64:
26869 Opcode = X86ISD::LWPINS;
26870 break;
26871 }
26872
26874 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26875 Op->getOperand(3), Op->getOperand(4));
26876 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26877 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26878 Operation.getValue(1));
26879 }
26880 case Intrinsic::x86_enqcmd:
26881 case Intrinsic::x86_enqcmds: {
26882 SDLoc dl(Op);
26883 SDValue Chain = Op.getOperand(0);
26884 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26885 unsigned Opcode;
26886 switch (IntNo) {
26887 default: llvm_unreachable("Impossible intrinsic!");
26888 case Intrinsic::x86_enqcmd:
26889 Opcode = X86ISD::ENQCMD;
26890 break;
26891 case Intrinsic::x86_enqcmds:
26892 Opcode = X86ISD::ENQCMDS;
26893 break;
26894 }
26895 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26896 Op.getOperand(3));
26897 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26898 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26899 Operation.getValue(1));
26900 }
26901 case Intrinsic::x86_aesenc128kl:
26902 case Intrinsic::x86_aesdec128kl:
26903 case Intrinsic::x86_aesenc256kl:
26904 case Intrinsic::x86_aesdec256kl: {
26905 SDLoc DL(Op);
26906 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26907 SDValue Chain = Op.getOperand(0);
26908 unsigned Opcode;
26909
26910 switch (IntNo) {
26911 default: llvm_unreachable("Impossible intrinsic");
26912 case Intrinsic::x86_aesenc128kl:
26913 Opcode = X86ISD::AESENC128KL;
26914 break;
26915 case Intrinsic::x86_aesdec128kl:
26916 Opcode = X86ISD::AESDEC128KL;
26917 break;
26918 case Intrinsic::x86_aesenc256kl:
26919 Opcode = X86ISD::AESENC256KL;
26920 break;
26921 case Intrinsic::x86_aesdec256kl:
26922 Opcode = X86ISD::AESDEC256KL;
26923 break;
26924 }
26925
26926 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26927 MachineMemOperand *MMO = MemIntr->getMemOperand();
26928 EVT MemVT = MemIntr->getMemoryVT();
26930 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26931 MMO);
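      // The key-locker nodes produce (data, EFLAGS, chain); materialize ZF as an
      // i8 value and return it together with the data and the chain.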
26932 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26933
26934 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26935 {ZF, Operation.getValue(0), Operation.getValue(2)});
26936 }
26937 case Intrinsic::x86_aesencwide128kl:
26938 case Intrinsic::x86_aesdecwide128kl:
26939 case Intrinsic::x86_aesencwide256kl:
26940 case Intrinsic::x86_aesdecwide256kl: {
26941 SDLoc DL(Op);
26942 SDVTList VTs = DAG.getVTList(
26943 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26944 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26945 SDValue Chain = Op.getOperand(0);
26946 unsigned Opcode;
26947
26948 switch (IntNo) {
26949 default: llvm_unreachable("Impossible intrinsic");
26950 case Intrinsic::x86_aesencwide128kl:
26951 Opcode = X86ISD::AESENCWIDE128KL;
26952 break;
26953 case Intrinsic::x86_aesdecwide128kl:
26954 Opcode = X86ISD::AESDECWIDE128KL;
26955 break;
26956 case Intrinsic::x86_aesencwide256kl:
26957 Opcode = X86ISD::AESENCWIDE256KL;
26958 break;
26959 case Intrinsic::x86_aesdecwide256kl:
26960 Opcode = X86ISD::AESDECWIDE256KL;
26961 break;
26962 }
26963
26964 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26965 MachineMemOperand *MMO = MemIntr->getMemOperand();
26966 EVT MemVT = MemIntr->getMemoryVT();
26968 Opcode, DL, VTs,
26969 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26970 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26971 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26972 MemVT, MMO);
26973 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26974
26975 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26976 {ZF, Operation.getValue(1), Operation.getValue(2),
26977 Operation.getValue(3), Operation.getValue(4),
26978 Operation.getValue(5), Operation.getValue(6),
26979 Operation.getValue(7), Operation.getValue(8),
26980 Operation.getValue(9)});
26981 }
26982 case Intrinsic::x86_testui: {
26983 SDLoc dl(Op);
26984 SDValue Chain = Op.getOperand(0);
26985 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26986 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26987 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26988 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26989 Operation.getValue(1));
26990 }
26991 case Intrinsic::x86_atomic_bts_rm:
26992 case Intrinsic::x86_atomic_btc_rm:
26993 case Intrinsic::x86_atomic_btr_rm: {
26994 SDLoc DL(Op);
26995 MVT VT = Op.getSimpleValueType();
26996 SDValue Chain = Op.getOperand(0);
26997 SDValue Op1 = Op.getOperand(2);
26998 SDValue Op2 = Op.getOperand(3);
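      // Select the locked bit-test-and-set/complement/reset node; the original
      // value of the tested bit is returned in CF.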
26999 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27000 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27002 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27003 SDValue Res =
27004 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27005 {Chain, Op1, Op2}, VT, MMO);
27006 Chain = Res.getValue(1);
27007 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27008 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27009 }
27010 case Intrinsic::x86_atomic_bts:
27011 case Intrinsic::x86_atomic_btc:
27012 case Intrinsic::x86_atomic_btr: {
27013 SDLoc DL(Op);
27014 MVT VT = Op.getSimpleValueType();
27015 SDValue Chain = Op.getOperand(0);
27016 SDValue Op1 = Op.getOperand(2);
27017 SDValue Op2 = Op.getOperand(3);
27018 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27019 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27020 : X86ISD::LBTR;
27021 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27022 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27023 SDValue Res =
27024 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27025 {Chain, Op1, Op2, Size}, VT, MMO);
27026 Chain = Res.getValue(1);
27027 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
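      // The bit position is an immediate here, so shift the captured CF bit back
      // to that position to recover the original bit in place.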
27028 unsigned Imm = Op2->getAsZExtVal();
27029 if (Imm)
27030 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27031 DAG.getShiftAmountConstant(Imm, VT, DL));
27032 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27033 }
27034 case Intrinsic::x86_cmpccxadd32:
27035 case Intrinsic::x86_cmpccxadd64: {
27036 SDLoc DL(Op);
27037 SDValue Chain = Op.getOperand(0);
27038 SDValue Addr = Op.getOperand(2);
27039 SDValue Src1 = Op.getOperand(3);
27040 SDValue Src2 = Op.getOperand(4);
27041 SDValue CC = Op.getOperand(5);
27042 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27044 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27045 MVT::i32, MMO);
27046 return Operation;
27047 }
27048 case Intrinsic::x86_aadd32:
27049 case Intrinsic::x86_aadd64:
27050 case Intrinsic::x86_aand32:
27051 case Intrinsic::x86_aand64:
27052 case Intrinsic::x86_aor32:
27053 case Intrinsic::x86_aor64:
27054 case Intrinsic::x86_axor32:
27055 case Intrinsic::x86_axor64: {
27056 SDLoc DL(Op);
27057 SDValue Chain = Op.getOperand(0);
27058 SDValue Op1 = Op.getOperand(2);
27059 SDValue Op2 = Op.getOperand(3);
27060 MVT VT = Op2.getSimpleValueType();
27061 unsigned Opc = 0;
27062 switch (IntNo) {
27063 default:
27064 llvm_unreachable("Unknown Intrinsic");
27065 case Intrinsic::x86_aadd32:
27066 case Intrinsic::x86_aadd64:
27067 Opc = X86ISD::AADD;
27068 break;
27069 case Intrinsic::x86_aand32:
27070 case Intrinsic::x86_aand64:
27071 Opc = X86ISD::AAND;
27072 break;
27073 case Intrinsic::x86_aor32:
27074 case Intrinsic::x86_aor64:
27075 Opc = X86ISD::AOR;
27076 break;
27077 case Intrinsic::x86_axor32:
27078 case Intrinsic::x86_axor64:
27079 Opc = X86ISD::AXOR;
27080 break;
27081 }
27082 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27083 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27084 {Chain, Op1, Op2}, VT, MMO);
27085 }
27086 case Intrinsic::x86_atomic_add_cc:
27087 case Intrinsic::x86_atomic_sub_cc:
27088 case Intrinsic::x86_atomic_or_cc:
27089 case Intrinsic::x86_atomic_and_cc:
27090 case Intrinsic::x86_atomic_xor_cc: {
27091 SDLoc DL(Op);
27092 SDValue Chain = Op.getOperand(0);
27093 SDValue Op1 = Op.getOperand(2);
27094 SDValue Op2 = Op.getOperand(3);
27095 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27096 MVT VT = Op2.getSimpleValueType();
27097 unsigned Opc = 0;
27098 switch (IntNo) {
27099 default:
27100 llvm_unreachable("Unknown Intrinsic");
27101 case Intrinsic::x86_atomic_add_cc:
27102 Opc = X86ISD::LADD;
27103 break;
27104 case Intrinsic::x86_atomic_sub_cc:
27105 Opc = X86ISD::LSUB;
27106 break;
27107 case Intrinsic::x86_atomic_or_cc:
27108 Opc = X86ISD::LOR;
27109 break;
27110 case Intrinsic::x86_atomic_and_cc:
27111 Opc = X86ISD::LAND;
27112 break;
27113 case Intrinsic::x86_atomic_xor_cc:
27114 Opc = X86ISD::LXOR;
27115 break;
27116 }
27117 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27118 SDValue LockArith =
27119 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27120 {Chain, Op1, Op2}, VT, MMO);
27121 Chain = LockArith.getValue(1);
27122 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27123 }
27124 }
27125 return SDValue();
27126 }
27127
27128 SDLoc dl(Op);
27129 switch(IntrData->Type) {
27130 default: llvm_unreachable("Unknown Intrinsic Type");
27131 case RDSEED:
27132 case RDRAND: {
27133 // Emit the node with the right value type.
27134 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27135 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27136
27137    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. Otherwise
27138    // return the random value itself, which is always 0 in that case, cast to i32.
27139 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27140 DAG.getConstant(1, dl, Op->getValueType(1)),
27141 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27142 SDValue(Result.getNode(), 1)};
27143 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27144
27145 // Return { result, isValid, chain }.
27146 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27147 SDValue(Result.getNode(), 2));
27148 }
27149 case GATHER_AVX2: {
27150 SDValue Chain = Op.getOperand(0);
27151 SDValue Src = Op.getOperand(2);
27152 SDValue Base = Op.getOperand(3);
27153 SDValue Index = Op.getOperand(4);
27154 SDValue Mask = Op.getOperand(5);
27155 SDValue Scale = Op.getOperand(6);
27156 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27157 Scale, Chain, Subtarget);
27158 }
27159 case GATHER: {
27160    // gather(v1, mask, index, base, scale);
27161 SDValue Chain = Op.getOperand(0);
27162 SDValue Src = Op.getOperand(2);
27163 SDValue Base = Op.getOperand(3);
27164 SDValue Index = Op.getOperand(4);
27165 SDValue Mask = Op.getOperand(5);
27166 SDValue Scale = Op.getOperand(6);
27167 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27168 Chain, Subtarget);
27169 }
27170 case SCATTER: {
27171    // scatter(base, mask, index, v1, scale);
27172 SDValue Chain = Op.getOperand(0);
27173 SDValue Base = Op.getOperand(2);
27174 SDValue Mask = Op.getOperand(3);
27175 SDValue Index = Op.getOperand(4);
27176 SDValue Src = Op.getOperand(5);
27177 SDValue Scale = Op.getOperand(6);
27178 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27179 Scale, Chain, Subtarget);
27180 }
27181 case PREFETCH: {
27182 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27183 assert((HintVal == 2 || HintVal == 3) &&
27184 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27185 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27186 SDValue Chain = Op.getOperand(0);
27187 SDValue Mask = Op.getOperand(2);
27188 SDValue Index = Op.getOperand(3);
27189 SDValue Base = Op.getOperand(4);
27190 SDValue Scale = Op.getOperand(5);
27191 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27192 Subtarget);
27193 }
27194 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27195 case RDTSC: {
27197 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27198 Results);
27199 return DAG.getMergeValues(Results, dl);
27200 }
27201 // Read Performance Monitoring Counters.
27202 case RDPMC:
27203 // Read Processor Register.
27204 case RDPRU:
27205  // Get Extended Control Register.
27206 case XGETBV: {
27208
27209 // RDPMC uses ECX to select the index of the performance counter to read.
27210 // RDPRU uses ECX to select the processor register to read.
27211 // XGETBV uses ECX to select the index of the XCR register to return.
27212 // The result is stored into registers EDX:EAX.
27213 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27214 Subtarget, Results);
27215 return DAG.getMergeValues(Results, dl);
27216 }
27217 // XTEST intrinsics.
27218 case XTEST: {
27219 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27220 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27221
27222 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27223 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27224 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27225 Ret, SDValue(InTrans.getNode(), 1));
27226 }
27229 case TRUNCATE_TO_MEM_VI32: {
27230 SDValue Mask = Op.getOperand(4);
27231 SDValue DataToTruncate = Op.getOperand(3);
27232 SDValue Addr = Op.getOperand(2);
27233 SDValue Chain = Op.getOperand(0);
27234
27235 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27236 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27237
27238 EVT MemVT = MemIntr->getMemoryVT();
27239
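    // Dispatch on the truncation flavor: VTRUNC becomes a (masked) truncating
    // store, while VTRUNCS/VTRUNCUS become saturating truncating stores.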
27240 uint16_t TruncationOp = IntrData->Opc0;
27241 switch (TruncationOp) {
27242 case X86ISD::VTRUNC: {
27243 if (isAllOnesConstant(Mask)) // return just a truncate store
27244 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27245 MemIntr->getMemOperand());
27246
27247 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27248 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27249 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27250
27251 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27252 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27253 true /* truncating */);
27254 }
27255 case X86ISD::VTRUNCUS:
27256 case X86ISD::VTRUNCS: {
27257 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27258 if (isAllOnesConstant(Mask))
27259 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27260 MemIntr->getMemOperand(), DAG);
27261
27262 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27263 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27264
27265 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27266 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27267 }
27268 default:
27269 llvm_unreachable("Unsupported truncstore intrinsic");
27270 }
27271 }
27272 }
27273}
27274
27275SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27276 SelectionDAG &DAG) const {
27278 MFI.setReturnAddressIsTaken(true);
27279
27281 return SDValue();
27282
27283 unsigned Depth = Op.getConstantOperandVal(0);
27284 SDLoc dl(Op);
27285 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27286
27287 if (Depth > 0) {
27288 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27289 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27290 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27291 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27292 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27294 }
27295
27296 // Just load the return address.
27297 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27298 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27300}
27301
27302SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27303 SelectionDAG &DAG) const {
27305 return getReturnAddressFrameIndex(DAG);
27306}
27307
27308SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27310 MachineFrameInfo &MFI = MF.getFrameInfo();
27312 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27313 EVT VT = Op.getValueType();
27314
27315 MFI.setFrameAddressIsTaken(true);
27316
27317 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27318 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27319 // is not possible to crawl up the stack without looking at the unwind codes
27320 // simultaneously.
27321 int FrameAddrIndex = FuncInfo->getFAIndex();
27322 if (!FrameAddrIndex) {
27323 // Set up a frame object for the return address.
27324 unsigned SlotSize = RegInfo->getSlotSize();
27325 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27326 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27327 FuncInfo->setFAIndex(FrameAddrIndex);
27328 }
27329 return DAG.getFrameIndex(FrameAddrIndex, VT);
27330 }
27331
27332 unsigned FrameReg =
27333 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27334 SDLoc dl(Op); // FIXME probably not meaningful
27335 unsigned Depth = Op.getConstantOperandVal(0);
27336 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27337 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27338 "Invalid Frame Register!");
27339 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
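  // The caller's frame pointer is saved at offset 0 of the current frame, so
  // each additional Depth level is one more load through that chain.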
27340 while (Depth--)
27341 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27343 return FrameAddr;
27344}
27345
27346// FIXME? Maybe this could be a TableGen attribute on some registers and
27347// this table could be generated automatically from RegInfo.
27349 const MachineFunction &MF) const {
27350 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27351
27353 .Case("esp", X86::ESP)
27354 .Case("rsp", X86::RSP)
27355 .Case("ebp", X86::EBP)
27356 .Case("rbp", X86::RBP)
27357 .Case("r14", X86::R14)
27358 .Case("r15", X86::R15)
27359 .Default(0);
27360
27361 if (Reg == X86::EBP || Reg == X86::RBP) {
27362 if (!TFI.hasFP(MF))
27363 report_fatal_error("register " + StringRef(RegName) +
27364 " is allocatable: function has no frame pointer");
27365#ifndef NDEBUG
27366 else {
27367 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27368 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27369 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27370 "Invalid Frame Register!");
27371 }
27372#endif
27373 }
27374
27375 if (Reg)
27376 return Reg;
27377
27378 report_fatal_error("Invalid register name global variable");
27379}
27380
27381SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27382 SelectionDAG &DAG) const {
27383 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27384 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27385}
27386
27388 const Constant *PersonalityFn) const {
27389 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27390 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27391
27392 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27393}
27394
27396 const Constant *PersonalityFn) const {
27397 // Funclet personalities don't use selectors (the runtime does the selection).
27399 return X86::NoRegister;
27400 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27401}
27402
27404 return Subtarget.isTargetWin64();
27405}
27406
27407SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27408 SDValue Chain = Op.getOperand(0);
27409 SDValue Offset = Op.getOperand(1);
27410 SDValue Handler = Op.getOperand(2);
27411 SDLoc dl (Op);
27412
27413 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27414 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27415 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27416 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27417 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27418 "Invalid Frame Register!");
27419 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27420 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27421
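  // Store the handler over the return-address slot of this frame (frame
  // pointer plus slot size, adjusted by Offset) and pass that address to
  // EH_RETURN in ECX/RCX.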
27422 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27423 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27424 dl));
27425 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27426 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27427 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27428
27429 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27430 DAG.getRegister(StoreAddrReg, PtrVT));
27431}
27432
27433SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27434 SelectionDAG &DAG) const {
27435 SDLoc DL(Op);
27436  // If the subtarget is not 64-bit, we may need the global base reg
27437  // after the isel pseudo expansion, i.e., after the CGBR pass has run.
27438  // Therefore, ask for the GlobalBaseReg now so that the pass inserts
27439  // the code for us in case we need it.
27440  // Otherwise, we would end up referencing a virtual register that is
27441  // never defined.
27442 if (!Subtarget.is64Bit()) {
27443 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27444 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27445 }
27446 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27447 DAG.getVTList(MVT::i32, MVT::Other),
27448 Op.getOperand(0), Op.getOperand(1));
27449}
27450
27451SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27452 SelectionDAG &DAG) const {
27453 SDLoc DL(Op);
27454 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27455 Op.getOperand(0), Op.getOperand(1));
27456}
27457
27458SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27459 SelectionDAG &DAG) const {
27460 SDLoc DL(Op);
27461 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27462 Op.getOperand(0));
27463}
27464
27465 SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
27466 return Op.getOperand(0);
27467}
27468
27469SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27470 SelectionDAG &DAG) const {
27471 SDValue Root = Op.getOperand(0);
27472 SDValue Trmp = Op.getOperand(1); // trampoline
27473 SDValue FPtr = Op.getOperand(2); // nested function
27474 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27475 SDLoc dl (Op);
27476
27477 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27478 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27479
27480 if (Subtarget.is64Bit()) {
27481 SDValue OutChains[6];
27482
27483 // Large code-model.
27484 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27485 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27486
27487 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27488 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27489
27490 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27491
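// A sketch of the bytes the stores below lay out (derived from the opcode
// constants above; offsets are in bytes, immediates are stored little-endian):
//   0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
//   10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
//   20: 49 FF E3                jmpq   *%r11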
27492 // Load the pointer to the nested function into R11.
27493 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27494 SDValue Addr = Trmp;
27495 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27496 Addr, MachinePointerInfo(TrmpAddr));
27497
27498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27499 DAG.getConstant(2, dl, MVT::i64));
27500 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27501 MachinePointerInfo(TrmpAddr, 2), Align(2));
27502
27503 // Load the 'nest' parameter value into R10.
27504 // R10 is specified in X86CallingConv.td
27505 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27506 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27507 DAG.getConstant(10, dl, MVT::i64));
27508 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27509 Addr, MachinePointerInfo(TrmpAddr, 10));
27510
27511 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27512 DAG.getConstant(12, dl, MVT::i64));
27513 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27514 MachinePointerInfo(TrmpAddr, 12), Align(2));
27515
27516 // Jump to the nested function.
27517 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27518 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27519 DAG.getConstant(20, dl, MVT::i64));
27520 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27521 Addr, MachinePointerInfo(TrmpAddr, 20));
27522
27523 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27525 DAG.getConstant(22, dl, MVT::i64));
27526 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27527 Addr, MachinePointerInfo(TrmpAddr, 22));
27528
27529 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27530 } else {
27531 const Function *Func =
27532 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27533 CallingConv::ID CC = Func->getCallingConv();
27534 unsigned NestReg;
27535
27536 switch (CC) {
27537 default:
27538 llvm_unreachable("Unsupported calling convention");
27539 case CallingConv::C:
27540 case CallingConv::X86_StdCall: {
27541 // Pass 'nest' parameter in ECX.
27542 // Must be kept in sync with X86CallingConv.td
27543 NestReg = X86::ECX;
27544
27545 // Check that ECX wasn't needed by an 'inreg' parameter.
27546 FunctionType *FTy = Func->getFunctionType();
27547 const AttributeList &Attrs = Func->getAttributes();
27548
27549 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27550 unsigned InRegCount = 0;
27551 unsigned Idx = 0;
27552
27553 for (FunctionType::param_iterator I = FTy->param_begin(),
27554 E = FTy->param_end(); I != E; ++I, ++Idx)
27555 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27556 const DataLayout &DL = DAG.getDataLayout();
27557 // FIXME: should only count parameters that are lowered to integers.
27558 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27559 }
27560
27561 if (InRegCount > 2) {
27562 report_fatal_error("Nest register in use - reduce number of inreg"
27563 " parameters!");
27564 }
27565 }
27566 break;
27567 }
27568 case CallingConv::X86_FastCall:
27569 case CallingConv::X86_ThisCall:
27570 case CallingConv::Fast:
27571 case CallingConv::Tail:
27572 case CallingConv::SwiftTail:
27573 // Pass 'nest' parameter in EAX.
27574 // Must be kept in sync with X86CallingConv.td
27575 NestReg = X86::EAX;
27576 break;
27577 }
27578
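// A sketch of the 10 bytes the stores below lay out (offsets in bytes):
//   0: B8+r <Nest, 4 bytes>   movl $Nest, %NestReg   (B9 for ECX, B8 for EAX)
//   5: E9   <Disp, 4 bytes>   jmp  FPtr              (Disp = FPtr - (Trmp + 10))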
27579 SDValue OutChains[4];
27580 SDValue Addr, Disp;
27581
27582 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27583 DAG.getConstant(10, dl, MVT::i32));
27584 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27585
27586 // This is storing the opcode for MOV32ri.
27587 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27588 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27589 OutChains[0] =
27590 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27591 Trmp, MachinePointerInfo(TrmpAddr));
27592
27593 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27594 DAG.getConstant(1, dl, MVT::i32));
27595 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27596 MachinePointerInfo(TrmpAddr, 1), Align(1));
27597
27598 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27600 DAG.getConstant(5, dl, MVT::i32));
27601 OutChains[2] =
27602 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27603 MachinePointerInfo(TrmpAddr, 5), Align(1));
27604
27605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27606 DAG.getConstant(6, dl, MVT::i32));
27607 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27608 MachinePointerInfo(TrmpAddr, 6), Align(1));
27609
27610 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27611 }
27612}
27613
27614SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27615 SelectionDAG &DAG) const {
27616 /*
27617 The rounding mode is in bits 11:10 of the x87 FP Control Word (FPCW), and has the following
27618 settings:
27619 00 Round to nearest
27620 01 Round to -inf
27621 10 Round to +inf
27622 11 Round to 0
27623
27624 GET_ROUNDING, on the other hand, expects the following:
27625 -1 Undefined
27626 0 Round to 0
27627 1 Round to nearest
27628 2 Round to +inf
27629 3 Round to -inf
27630
27631 To perform the conversion, we use a packed lookup table of the four 2-bit
27632 values that we can index by FPCW[11:10]
27633 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
27634
27635 (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
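   For example (an illustrative check of the table): if FPCW[11:10] is 01
   (round toward -inf), then (0x2d >> ((0x0400 & 0xc00) >> 9)) & 3
   = (0x2d >> 2) & 3 = 3, which is GET_ROUNDING's "Round to -inf".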
27636 */
27637
27638 MachineFunction &MF = DAG.getMachineFunction();
27639 MVT VT = Op.getSimpleValueType();
27640 SDLoc DL(Op);
27641
27642 // Save FP Control Word to stack slot
27643 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27644 SDValue StackSlot =
27645 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27646
27647 MachinePointerInfo MPI =
27648 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
27649 SDValue Chain = Op.getOperand(0);
27650 SDValue Ops[] = {Chain, StackSlot};
27651 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27652 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27653 MachineMemOperand::MOStore);
27654
27655 // Load FP Control Word from stack slot
27656 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27657 Chain = CWD.getValue(1);
27658
27659 // Mask and turn the control bits into a shift for the lookup table.
27660 SDValue Shift =
27661 DAG.getNode(ISD::SRL, DL, MVT::i16,
27662 DAG.getNode(ISD::AND, DL, MVT::i16,
27663 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27664 DAG.getConstant(9, DL, MVT::i8));
27665 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27666
27667 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27668 SDValue RetVal =
27669 DAG.getNode(ISD::AND, DL, MVT::i32,
27670 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27671 DAG.getConstant(3, DL, MVT::i32));
27672
27673 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27674
27675 return DAG.getMergeValues({RetVal, Chain}, DL);
27676}
27677
27678SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27679 SelectionDAG &DAG) const {
27680 MachineFunction &MF = DAG.getMachineFunction();
27681 SDLoc DL(Op);
27682 SDValue Chain = Op.getNode()->getOperand(0);
27683
27684 // The FP control word can be set only from data in memory, so we need to
27685 // allocate stack space to save/load the FP control word.
27686 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27687 SDValue StackSlot =
27688 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27689 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27690 MachineMemOperand *MMO =
27691 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27692
27693 // Store FP control word into memory.
27694 SDValue Ops[] = {Chain, StackSlot};
27695 Chain = DAG.getMemIntrinsicNode(
27696 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27697
27698 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27699 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27700 Chain = CWD.getValue(1);
27701 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27702 DAG.getConstant(0xf3ff, DL, MVT::i16));
27703
27704 // Calculate new rounding mode.
27705 SDValue NewRM = Op.getNode()->getOperand(1);
27706 SDValue RMBits;
27707 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27708 uint64_t RM = CVal->getZExtValue();
27709 int FieldVal;
27710 switch (static_cast<RoundingMode>(RM)) {
27711 // clang-format off
27712 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27713 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27714 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27715 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27716 default:
27717 llvm_unreachable("rounding mode is not supported by X86 hardware");
27718 // clang-format on
27719 }
27720 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27721 } else {
27722 // Need to convert argument into bits of control word:
27723 // 0 Round to 0 -> 11
27724 // 1 Round to nearest -> 00
27725 // 2 Round to +inf -> 10
27726 // 3 Round to -inf -> 01
27727 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27728 // To make the conversion, put all these values into a value 0xc9 and shift
27729 // it left depending on the rounding mode:
27730 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27731 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27732 // ...
27733 // (0xc9 << (2 * NewRM + 4)) & 0xc00
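// For example (illustrative): NewRM = 2 (round to +inf) gives a shift of
// 2 * 2 + 4 = 8, and (0xc9 << 8) & 0xc00 = 0x800, i.e. RM bits 11:10 = 10,
// which is the x87 encoding for rounding toward +inf.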
27734 SDValue ShiftValue =
27735 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27736 DAG.getNode(ISD::ADD, DL, MVT::i32,
27737 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27738 DAG.getConstant(1, DL, MVT::i8)),
27739 DAG.getConstant(4, DL, MVT::i32)));
27740 SDValue Shifted =
27741 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27742 ShiftValue);
27743 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27744 DAG.getConstant(0xc00, DL, MVT::i16));
27745 }
27746
27747 // Update rounding mode bits and store the new FP Control Word into stack.
27748 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27749 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27750
27751 // Load FP control word from the slot.
27752 SDValue OpsLD[] = {Chain, StackSlot};
27753 MachineMemOperand *MMOL =
27754 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27755 Chain = DAG.getMemIntrinsicNode(
27756 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27757
27758 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27759 // same way but in bits 14:13.
27760 if (Subtarget.hasSSE1()) {
27761 // Store MXCSR into memory.
27762 Chain = DAG.getNode(
27763 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27764 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27765 StackSlot);
27766
27767 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27768 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27769 Chain = CWD.getValue(1);
27770 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27771 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27772
27773 // Shift X87 RM bits from 11:10 to 14:13.
27774 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27775 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27776 DAG.getConstant(3, DL, MVT::i8));
27777
27778 // Update rounding mode bits and store the new FP Control Word into stack.
27779 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27780 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27781
27782 // Load MXCSR from the slot.
27783 Chain = DAG.getNode(
27784 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27785 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27786 StackSlot);
27787 }
27788
27789 return Chain;
27790}
27791
27792const unsigned X87StateSize = 28;
27793const unsigned FPStateSize = 32;
27794[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
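// Note: the FP environment image used below is the 28-byte x87 FNSTENV area
// followed by the 4-byte MXCSR (stored at offset X87StateSize), 32 bytes total.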
27795
27796SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27797 SelectionDAG &DAG) const {
27798 MachineFunction &MF = DAG.getMachineFunction();
27799 SDLoc DL(Op);
27800 SDValue Chain = Op->getOperand(0);
27801 SDValue Ptr = Op->getOperand(1);
27802 auto *Node = cast<FPStateAccessSDNode>(Op);
27803 EVT MemVT = Node->getMemoryVT();
27804 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected memory size");
27805 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27806
27807 // Get the x87 state, if it is present.
27808 if (Subtarget.hasX87()) {
27809 Chain =
27810 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27811 {Chain, Ptr}, MemVT, MMO);
27812
27813 // FNSTENV changes the exception mask, so load back the stored environment.
27814 MachineMemOperand::Flags NewFlags =
27815 MachineMemOperand::MOLoad |
27816 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27817 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27818 Chain =
27819 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27820 {Chain, Ptr}, MemVT, MMO);
27821 }
27822
27823 // If target supports SSE, get MXCSR as well.
27824 if (Subtarget.hasSSE1()) {
27825 // Get pointer to the MXCSR location in memory.
27826 MVT PtrVT = getPointerTy(DAG.getDataLayout());
27827 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27828 DAG.getConstant(X87StateSize, DL, PtrVT));
27829 // Store MXCSR into memory.
27830 Chain = DAG.getNode(
27831 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27832 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27833 MXCSRAddr);
27834 }
27835
27836 return Chain;
27837}
27838
27839 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
27840 EVT MemVT, MachineMemOperand *MMO,
27841 SelectionDAG &DAG,
27842 const X86Subtarget &Subtarget) {
27843 // Set the x87 state, if it is present.
27844 if (Subtarget.hasX87())
27845 Chain =
27846 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27847 {Chain, Ptr}, MemVT, MMO);
27848 // If target supports SSE, set MXCSR as well.
27849 if (Subtarget.hasSSE1()) {
27850 // Get pointer to the MXCSR location in memory.
27851 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27852 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27853 DAG.getConstant(X87StateSize, DL, PtrVT));
27854 // Load MXCSR from memory.
27855 Chain = DAG.getNode(
27856 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27857 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27858 MXCSRAddr);
27859 }
27860 return Chain;
27861}
27862
27863SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27864 SelectionDAG &DAG) const {
27865 SDLoc DL(Op);
27866 SDValue Chain = Op->getOperand(0);
27867 SDValue Ptr = Op->getOperand(1);
27868 auto *Node = cast<FPStateAccessSDNode>(Op);
27869 EVT MemVT = Node->getMemoryVT();
27870 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected memory size");
27871 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27872 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27873}
27874
27875SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27876 SelectionDAG &DAG) const {
27877 MachineFunction &MF = DAG.getMachineFunction();
27878 SDLoc DL(Op);
27879 SDValue Chain = Op.getNode()->getOperand(0);
27880
27881 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27882 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27883 SmallVector<Constant *, 8> FPEnvVals;
27884
27885 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
27886 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27887 // for compatibility with glibc.
27888 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27889 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27890 Constant *Zero = ConstantInt::get(ItemTy, 0);
27891 for (unsigned I = 0; I < 6; ++I)
27892 FPEnvVals.push_back(Zero);
27893
27894 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
27895 // all exception flags, and sets DAZ and FTZ to 0.
27896 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27897 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27898 MVT PtrVT = getPointerTy(DAG.getDataLayout());
27899 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27900 MachinePointerInfo MPI =
27901 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27902 MachineMemOperand *MMO = MF.getMachineMemOperand(
27903 MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
27904
27905 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27906}
27907
27908 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27909 //
27910 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
27911 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27912 // split the vector, perform the operation on its Lo and Hi parts and
27913 // concatenate the results.
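// For example (illustrative): a v16i8 input byte 0x1A is zero-extended to the
// i32 value 0x0000001A, vplzcntd yields 27, and subtracting 32 - 8 = 24 gives
// the expected i8 ctlz of 3.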
27914 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27915 const X86Subtarget &Subtarget) {
27916 assert(Op.getOpcode() == ISD::CTLZ);
27917 SDLoc dl(Op);
27918 MVT VT = Op.getSimpleValueType();
27919 MVT EltVT = VT.getVectorElementType();
27920 unsigned NumElems = VT.getVectorNumElements();
27921
27922 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27923 "Unsupported element type");
27924
27925 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27926 if (NumElems > 16 ||
27927 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27928 return splitVectorIntUnary(Op, DAG, dl);
27929
27930 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27931 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27932 "Unsupported value type for operation");
27933
27934 // Use native supported vector instruction vplzcntd.
27935 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27936 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27937 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27938 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27939
27940 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27941}
27942
27943// Lower CTLZ using a PSHUFB lookup table implementation.
27944 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27945 const X86Subtarget &Subtarget,
27946 SelectionDAG &DAG) {
27947 MVT VT = Op.getSimpleValueType();
27948 int NumElts = VT.getVectorNumElements();
27949 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27950 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27951
27952 // Per-nibble leading zero PSHUFB lookup table.
27953 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27954 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27955 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27956 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27957
27958 SmallVector<SDValue, 64> LUTVec;
27959 for (int i = 0; i < NumBytes; ++i)
27960 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27961 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27962
27963 // Begin by bitcasting the input to byte vector, then split those bytes
27964 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27965 // If the hi input nibble is zero then we add both results together, otherwise
27966 // we just take the hi result (by masking the lo result to zero before the
27967 // add).
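// For example (illustrative): for the byte 0x1A the hi nibble is 0x1, so the
// LUT gives 3; since the hi nibble is non-zero the lo result is masked to
// zero, and the final count is 3 = ctlz(0x1A) within 8 bits.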
27968 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27969 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27970
27971 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27972 SDValue Lo = Op0;
27973 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27974 SDValue HiZ;
27975 if (CurrVT.is512BitVector()) {
27976 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27977 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27978 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27979 } else {
27980 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27981 }
27982
27983 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27984 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27985 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27986 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27987
27988 // Merge the result from vXi8 back to VT, working on the lo/hi halves
27989 // of the current vector width in the same way we did for the nibbles.
27990 // If the upper half of the input element is zero then add the halves'
27991 // leading zero counts together, otherwise just use the upper half's.
27992 // Double the width of the result until we are at target width.
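// For example (illustrative): for an i16 element 0x001A the per-byte counts
// are 8 (upper byte, all zero) and 3 (lower byte); the upper byte is zero, so
// the halves are added to give 11 = ctlz(0x001A) within 16 bits.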
27993 while (CurrVT != VT) {
27994 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27995 int CurrNumElts = CurrVT.getVectorNumElements();
27996 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27997 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27998 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27999
28000 // Check if the upper half of the input element is zero.
28001 if (CurrVT.is512BitVector()) {
28002 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28003 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28004 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28005 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28006 } else {
28007 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28008 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28009 }
28010 HiZ = DAG.getBitcast(NextVT, HiZ);
28011
28012 // Move the upper/lower halves to the lower bits as we'll be extending to
28013 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28014 // together.
28015 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28016 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28017 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28018 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28019 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28020 CurrVT = NextVT;
28021 }
28022
28023 return Res;
28024}
28025
28026 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28027 const X86Subtarget &Subtarget,
28028 SelectionDAG &DAG) {
28029 MVT VT = Op.getSimpleValueType();
28030
28031 if (Subtarget.hasCDI() &&
28032 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28033 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28034 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28035
28036 // Decompose 256-bit ops into smaller 128-bit ops.
28037 if (VT.is256BitVector() && !Subtarget.hasInt256())
28038 return splitVectorIntUnary(Op, DAG, DL);
28039
28040 // Decompose 512-bit ops into smaller 256-bit ops.
28041 if (VT.is512BitVector() && !Subtarget.hasBWI())
28042 return splitVectorIntUnary(Op, DAG, DL);
28043
28044 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28045 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28046}
28047
28048static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28049 SelectionDAG &DAG) {
28050 MVT VT = Op.getSimpleValueType();
28051 MVT OpVT = VT;
28052 unsigned NumBits = VT.getSizeInBits();
28053 SDLoc dl(Op);
28054 unsigned Opc = Op.getOpcode();
28055
28056 if (VT.isVector())
28057 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28058
28059 Op = Op.getOperand(0);
28060 if (VT == MVT::i8) {
28061 // Zero extend to i32 since there is not an i8 bsr.
28062 OpVT = MVT::i32;
28063 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28064 }
28065
28066 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28067 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28068 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28069
28070 if (Opc == ISD::CTLZ) {
28071 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28072 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28073 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28074 Op.getValue(1)};
28075 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28076 }
28077
28078 // Finally xor with NumBits-1.
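// For example (illustrative): for an i32 input 0x00010000, BSR yields 16 and
// 16 ^ 31 = 15, which is the expected ctlz.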
28079 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28080 DAG.getConstant(NumBits - 1, dl, OpVT));
28081
28082 if (VT == MVT::i8)
28083 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28084 return Op;
28085}
28086
28087static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28088 SelectionDAG &DAG) {
28089 MVT VT = Op.getSimpleValueType();
28090 unsigned NumBits = VT.getScalarSizeInBits();
28091 SDValue N0 = Op.getOperand(0);
28092 SDLoc dl(Op);
28093
28094 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28095 "Only scalar CTTZ requires custom lowering");
28096
28097 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28098 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28099 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28100
28101 // If src is known never zero we can skip the CMOV.
28102 if (DAG.isKnownNeverZero(N0))
28103 return Op;
28104
28105 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28106 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28107 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28108 Op.getValue(1)};
28109 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28110}
28111
28112 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28113 const X86Subtarget &Subtarget) {
28114 MVT VT = Op.getSimpleValueType();
28115 SDLoc DL(Op);
28116
28117 if (VT == MVT::i16 || VT == MVT::i32)
28118 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28119
28120 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28121 return splitVectorIntBinary(Op, DAG, DL);
28122
28123 assert(Op.getSimpleValueType().is256BitVector() &&
28124 Op.getSimpleValueType().isInteger() &&
28125 "Only handle AVX 256-bit vector integer operation");
28126 return splitVectorIntBinary(Op, DAG, DL);
28127}
28128
28129 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28130 const X86Subtarget &Subtarget) {
28131 MVT VT = Op.getSimpleValueType();
28132 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28133 unsigned Opcode = Op.getOpcode();
28134 SDLoc DL(Op);
28135
28136 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28137 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28138 assert(Op.getSimpleValueType().isInteger() &&
28139 "Only handle AVX vector integer operation");
28140 return splitVectorIntBinary(Op, DAG, DL);
28141 }
28142
28143 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28144 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28145 EVT SetCCResultType =
28146 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28147
28148 unsigned BitWidth = VT.getScalarSizeInBits();
28149 if (Opcode == ISD::USUBSAT) {
28150 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28151 // Handle a special-case with a bit-hack instead of cmp+select:
28152 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28153 // If the target can use VPTERNLOG, DAGToDAG will match this as
28154 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28155 // "broadcast" constant load.
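// For example (illustrative, i8 lanes where SMIN is 0x80): for X = 0x05,
// (0x05 ^ 0x80) & (0x05 s>> 7) = 0x85 & 0x00 = 0, and for X = 0x85 (133),
// (0x85 ^ 0x80) & (0x85 s>> 7) = 0x05 & 0xFF = 5 = 133 - 128, as expected.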
28156 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28157 if (C && C->getAPIntValue().isSignMask()) {
28158 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28159 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28160 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28161 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28162 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28163 }
28164 }
28165 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28166 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28167 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28168 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28169 // TODO: Move this to DAGCombiner?
28170 if (SetCCResultType == VT &&
28171 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28172 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28173 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28174 }
28175 }
28176
28177 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28178 (!VT.isVector() || VT == MVT::v2i64)) {
28179 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28180 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28181 SDValue Zero = DAG.getConstant(0, DL, VT);
28182 SDValue Result =
28183 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28184 DAG.getVTList(VT, SetCCResultType), X, Y);
28185 SDValue SumDiff = Result.getValue(0);
28186 SDValue Overflow = Result.getValue(1);
28187 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28188 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28189 SDValue SumNeg =
28190 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28191 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28192 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28193 }
28194
28195 // Use default expansion.
28196 return SDValue();
28197}
28198
28199static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28200 SelectionDAG &DAG) {
28201 MVT VT = Op.getSimpleValueType();
28202 SDLoc DL(Op);
28203
28204 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28205 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28206 // 8-bit integer abs to NEG and CMOV.
28207 SDValue N0 = Op.getOperand(0);
28208 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28209 DAG.getConstant(0, DL, VT), N0);
28210 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28211 SDValue(Neg.getNode(), 1)};
28212 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28213 }
28214
28215 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28216 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28217 SDValue Src = Op.getOperand(0);
28218 SDValue Neg = DAG.getNegative(Src, DL, VT);
28219 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28220 }
28221
28222 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28223 assert(VT.isInteger() &&
28224 "Only handle AVX 256-bit vector integer operation");
28225 return splitVectorIntUnary(Op, DAG, DL);
28226 }
28227
28228 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28229 return splitVectorIntUnary(Op, DAG, DL);
28230
28231 // Default to expand.
28232 return SDValue();
28233}
28234
28235static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28236 SelectionDAG &DAG) {
28237 MVT VT = Op.getSimpleValueType();
28238 SDLoc DL(Op);
28239
28240 // For AVX1 cases, split to use legal ops.
28241 if (VT.is256BitVector() && !Subtarget.hasInt256())
28242 return splitVectorIntBinary(Op, DAG, DL);
28243
28244 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28245 return splitVectorIntBinary(Op, DAG, DL);
28246
28247 // Default to expand.
28248 return SDValue();
28249}
28250
28251static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28252 SelectionDAG &DAG) {
28253 MVT VT = Op.getSimpleValueType();
28254 SDLoc DL(Op);
28255
28256 // For AVX1 cases, split to use legal ops.
28257 if (VT.is256BitVector() && !Subtarget.hasInt256())
28258 return splitVectorIntBinary(Op, DAG, DL);
28259
28260 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28261 return splitVectorIntBinary(Op, DAG, DL);
28262
28263 // Default to expand.
28264 return SDValue();
28265}
28266
28267 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28268 SelectionDAG &DAG) {
28269 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28270 "Expected FMAXIMUM or FMINIMUM opcode");
28271 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28272 EVT VT = Op.getValueType();
28273 SDValue X = Op.getOperand(0);
28274 SDValue Y = Op.getOperand(1);
28275 SDLoc DL(Op);
28276 uint64_t SizeInBits = VT.getScalarSizeInBits();
28277 APInt PreferredZero = APInt::getZero(SizeInBits);
28278 APInt OppositeZero = PreferredZero;
28279 EVT IVT = VT.changeTypeToInteger();
28280 X86ISD::NodeType MinMaxOp;
28281 if (Op.getOpcode() == ISD::FMAXIMUM) {
28282 MinMaxOp = X86ISD::FMAX;
28283 OppositeZero.setSignBit();
28284 } else {
28285 PreferredZero.setSignBit();
28286 MinMaxOp = X86ISD::FMIN;
28287 }
28288 EVT SetCCType =
28289 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28290
28291 // The tables below show the expected result of Max in cases of NaN and
28292 // signed zeros.
28293 //
28294 // Y Y
28295 // Num xNaN +0 -0
28296 // --------------- ---------------
28297 // Num | Max | Y | +0 | +0 | +0 |
28298 // X --------------- X ---------------
28299 // xNaN | X | X/Y | -0 | +0 | -0 |
28300 // --------------- ---------------
28301 //
28302 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28303 // reordering.
28304 //
28305 // We check if any of operands is NaN and return NaN. Then we check if any of
28306 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28307 // to ensure the correct zero is returned.
28308 auto MatchesZero = [](SDValue Op, APInt Zero) {
28309 Op = peekThroughBitcasts(Op);
28310 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28311 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28312 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28313 return CstOp->getAPIntValue() == Zero;
28314 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28315 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28316 for (const SDValue &OpVal : Op->op_values()) {
28317 if (OpVal.isUndef())
28318 continue;
28319 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28320 if (!CstOp)
28321 return false;
28322 if (!CstOp->getValueAPF().isZero())
28323 continue;
28324 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28325 return false;
28326 }
28327 return true;
28328 }
28329 return false;
28330 };
28331
28332 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28333 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28334 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28335 Op->getFlags().hasNoSignedZeros() ||
28336 DAG.isKnownNeverZeroFloat(X) ||
28337 DAG.isKnownNeverZeroFloat(Y);
28338 SDValue NewX, NewY;
28339 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28340 MatchesZero(X, OppositeZero)) {
28341 // Operands are already in right order or order does not matter.
28342 NewX = X;
28343 NewY = Y;
28344 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28345 NewX = Y;
28346 NewY = X;
28347 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28348 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28349 if (IsXNeverNaN)
28350 std::swap(X, Y);
28351 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28352 // to an xmm register.
28353 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28354 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28355 // Bits of classes:
28356 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28357 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28358 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28359 DL, MVT::i32);
28360 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28361 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28362 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28363 DAG.getIntPtrConstant(0, DL));
28364 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28365 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28366 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28367 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28368 } else {
28369 SDValue IsXSigned;
28370 if (Subtarget.is64Bit() || VT != MVT::f64) {
28371 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28372 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28373 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28374 } else {
28375 assert(VT == MVT::f64);
28376 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28377 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28378 DAG.getIntPtrConstant(0, DL));
28379 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28380 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28381 DAG.getIntPtrConstant(1, DL));
28382 Hi = DAG.getBitcast(MVT::i32, Hi);
28383 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28384 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28385 *DAG.getContext(), MVT::i32);
28386 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28387 }
28388 if (MinMaxOp == X86ISD::FMAX) {
28389 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28390 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28391 } else {
28392 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28393 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28394 }
28395 }
28396
28397 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28398 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28399
28400 // If we did not reorder the operands for signed-zero handling, but we do need
28401 // to handle NaN and we know that the second operand is not NaN, put it in the
28402 // first operand so we will not need to post-process NaN after the max/min.
28403 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28404 std::swap(NewX, NewY);
28405
28406 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28407
28408 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28409 return MinMax;
28410
28411 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28412 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28413}
28414
28415static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28416 SelectionDAG &DAG) {
28417 MVT VT = Op.getSimpleValueType();
28418 SDLoc dl(Op);
28419
28420 // For AVX1 cases, split to use legal ops.
28421 if (VT.is256BitVector() && !Subtarget.hasInt256())
28422 return splitVectorIntBinary(Op, DAG, dl);
28423
28424 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28425 return splitVectorIntBinary(Op, DAG, dl);
28426
28427 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28429
28430 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28431 if (VT.isScalarInteger()) {
28432 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28433 MVT WideVT = MVT::getIntegerVT(WideBits);
28434 if (TLI.isTypeLegal(WideVT)) {
28435 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28436 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
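// For example (illustrative): abdu(i8 10, i8 250) widens to i32, computes
// abs(10 - 250) = 240, and truncates back to i8 240 = |10 - 250|.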
28437 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28438 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28439 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28440 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28441 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28442 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28443 }
28444 }
28445
28446 // Default to expand.
28447 return SDValue();
28448}
28449
28450static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28451 SelectionDAG &DAG) {
28452 SDLoc dl(Op);
28453 MVT VT = Op.getSimpleValueType();
28454
28455 // Decompose 256-bit ops into 128-bit ops.
28456 if (VT.is256BitVector() && !Subtarget.hasInt256())
28457 return splitVectorIntBinary(Op, DAG, dl);
28458
28459 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28460 return splitVectorIntBinary(Op, DAG, dl);
28461
28462 SDValue A = Op.getOperand(0);
28463 SDValue B = Op.getOperand(1);
28464
28465 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28466 // vector pairs, multiply and truncate.
28467 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28468 unsigned NumElts = VT.getVectorNumElements();
28469 unsigned NumLanes = VT.getSizeInBits() / 128;
28470 unsigned NumEltsPerLane = NumElts / NumLanes;
28471
28472 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28473 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28474 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28475 return DAG.getNode(
28476 ISD::TRUNCATE, dl, VT,
28477 DAG.getNode(ISD::MUL, dl, ExVT,
28478 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28479 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28480 }
28481
28482 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28483
28484 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
28485 // Don't do this if we only need to unpack one half.
28486 if (Subtarget.hasSSSE3()) {
28487 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
28488 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
28489 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
28490 if (BIsBuildVector) {
28491 for (auto [Idx, Val] : enumerate(B->ops())) {
28492 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
28493 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
28494 else
28495 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
28496 }
28497 }
28498 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
28499 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
28500 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
28501 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
28502 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
28503 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
28504 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
28505 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
28506 DAG.getTargetConstant(8, dl, MVT::i8));
28507 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
28508 }
28509 }
28510
28511 // Extract the lo/hi parts to any extend to i16.
28512 // We're going to mask off the low byte of each result element of the
28513 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28514 // element.
28515 SDValue Undef = DAG.getUNDEF(VT);
28516 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28517 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28518
28519 SDValue BLo, BHi;
28520 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28521 // If the RHS is a constant, manually unpackl/unpackh.
28522 SmallVector<SDValue, 16> LoOps, HiOps;
28523 for (unsigned i = 0; i != NumElts; i += 16) {
28524 for (unsigned j = 0; j != 8; ++j) {
28525 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28526 MVT::i16));
28527 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28528 MVT::i16));
28529 }
28530 }
28531
28532 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28533 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28534 } else {
28535 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28536 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28537 }
28538
28539 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28540 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28541 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28542 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28543 }
28544
28545 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28546 if (VT == MVT::v4i32) {
28547 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28548 "Should not custom lower when pmulld is available!");
28549
28550 // Extract the odd parts.
28551 static const int UnpackMask[] = { 1, -1, 3, -1 };
28552 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28553 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28554
28555 // Multiply the even parts.
28556 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28557 DAG.getBitcast(MVT::v2i64, A),
28558 DAG.getBitcast(MVT::v2i64, B));
28559 // Now multiply odd parts.
28560 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28561 DAG.getBitcast(MVT::v2i64, Aodds),
28562 DAG.getBitcast(MVT::v2i64, Bodds));
28563
28564 Evens = DAG.getBitcast(VT, Evens);
28565 Odds = DAG.getBitcast(VT, Odds);
28566
28567 // Merge the two vectors back together with a shuffle. This expands into 2
28568 // shuffles.
28569 static const int ShufMask[] = { 0, 4, 2, 6 };
28570 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28571 }
28572
28573 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28574 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28575 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28576
28577 // Ahi = psrlqi(a, 32);
28578 // Bhi = psrlqi(b, 32);
28579 //
28580 // AloBlo = pmuludq(a, b);
28581 // AloBhi = pmuludq(a, Bhi);
28582 // AhiBlo = pmuludq(Ahi, b);
28583 //
28584 // Hi = psllqi(AloBhi + AhiBlo, 32);
28585 // return AloBlo + Hi;
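// This follows from A * B = (Alo + 2^32 * Ahi) * (Blo + 2^32 * Bhi)
// = Alo*Blo + 2^32 * (Alo*Bhi + Ahi*Blo) (mod 2^64); the Ahi*Bhi term
// is shifted out entirely.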
28586 KnownBits AKnown = DAG.computeKnownBits(A);
28587 KnownBits BKnown = DAG.computeKnownBits(B);
28588
28589 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28590 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28591 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28592
28593 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28594 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28595 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28596
28597 SDValue Zero = DAG.getConstant(0, dl, VT);
28598
28599 // Only multiply lo/hi halves that aren't known to be zero.
28600 SDValue AloBlo = Zero;
28601 if (!ALoIsZero && !BLoIsZero)
28602 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28603
28604 SDValue AloBhi = Zero;
28605 if (!ALoIsZero && !BHiIsZero) {
28606 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28607 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28608 }
28609
28610 SDValue AhiBlo = Zero;
28611 if (!AHiIsZero && !BLoIsZero) {
28612 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28613 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28614 }
28615
28616 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28617 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28618
28619 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28620}
28621
28622 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28623 MVT VT, bool IsSigned,
28624 const X86Subtarget &Subtarget,
28625 SelectionDAG &DAG,
28626 SDValue *Low = nullptr) {
28627 unsigned NumElts = VT.getVectorNumElements();
28628
28629 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28630 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28631 // lane results back together.
28632
28633 // We'll take different approaches for signed and unsigned.
28634 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
28635 // words and use pmullw to calculate the full 16-bit product.
28636 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28637 // shift them left into the upper byte of each word. This allows us to use
28638 // pmulhw to calculate the full 16-bit product. This trick means we don't
28639 // need to sign extend the bytes to use pmullw.
28640
28641 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28642 SDValue Zero = DAG.getConstant(0, dl, VT);
28643
28644 SDValue ALo, AHi;
28645 if (IsSigned) {
28646 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28647 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28648 } else {
28649 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28650 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28651 }
28652
28653 SDValue BLo, BHi;
28654 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28655 // If the RHS is a constant, manually unpackl/unpackh and extend.
28656 SmallVector<SDValue, 16> LoOps, HiOps;
28657 for (unsigned i = 0; i != NumElts; i += 16) {
28658 for (unsigned j = 0; j != 8; ++j) {
28659 SDValue LoOp = B.getOperand(i + j);
28660 SDValue HiOp = B.getOperand(i + j + 8);
28661
28662 if (IsSigned) {
28663 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28664 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28665 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28666 DAG.getConstant(8, dl, MVT::i16));
28667 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28668 DAG.getConstant(8, dl, MVT::i16));
28669 } else {
28670 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28671 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28672 }
28673
28674 LoOps.push_back(LoOp);
28675 HiOps.push_back(HiOp);
28676 }
28677 }
28678
28679 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28680 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28681 } else if (IsSigned) {
28682 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28683 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28684 } else {
28685 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28686 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28687 }
28688
28689 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28690 // pack back to vXi8.
28691 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28692 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28693 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28694
28695 if (Low)
28696 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28697
28698 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28699}
28700
28701static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28702 SelectionDAG &DAG) {
28703 SDLoc dl(Op);
28704 MVT VT = Op.getSimpleValueType();
28705 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28706 unsigned NumElts = VT.getVectorNumElements();
28707 SDValue A = Op.getOperand(0);
28708 SDValue B = Op.getOperand(1);
28709
28710 // Decompose 256-bit ops into 128-bit ops.
28711 if (VT.is256BitVector() && !Subtarget.hasInt256())
28712 return splitVectorIntBinary(Op, DAG, dl);
28713
28714 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28715 return splitVectorIntBinary(Op, DAG, dl);
28716
28717 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28718 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28719 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28720 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28721
28722 // PMULxD operations multiply each even value (starting at 0) of LHS with
28723 // the related value of RHS and produce a widened result.
28724 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28725 // => <2 x i64> <ae|cg>
28726 //
28727 // In other words, to have all the results, we need to perform two PMULxD:
28728 // 1. one with the even values.
28729 // 2. one with the odd values.
28730 // To achieve #2, we need to place the odd values at an even position.
28731 //
28732 // Place the odd value at an even position (basically, shift all values 1
28733 // step to the left):
28734 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28735 9, -1, 11, -1, 13, -1, 15, -1};
28736 // <a|b|c|d> => <b|undef|d|undef>
28737 SDValue Odd0 =
28738 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28739 // <e|f|g|h> => <f|undef|h|undef>
28740 SDValue Odd1 =
28741 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28742
28743 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28744 // ints.
28745 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28746 unsigned Opcode =
28747 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28748 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28749 // => <2 x i64> <ae|cg>
28750 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28751 DAG.getBitcast(MulVT, A),
28752 DAG.getBitcast(MulVT, B)));
28753 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28754 // => <2 x i64> <bf|dh>
28755 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28756 DAG.getBitcast(MulVT, Odd0),
28757 DAG.getBitcast(MulVT, Odd1)));
28758
28759 // Shuffle it back into the right order.
28760 SmallVector<int, 16> ShufMask(NumElts);
28761 for (int i = 0; i != (int)NumElts; ++i)
28762 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28763
28764 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28765
28766 // If we have a signed multiply but no PMULDQ, fix up the result of an
28767 // unsigned multiply.
28768 if (IsSigned && !Subtarget.hasSSE41()) {
28769 SDValue Zero = DAG.getConstant(0, dl, VT);
28770 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28771 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28772 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28773 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28774
28775 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28776 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28777 }
28778
28779 return Res;
28780 }
28781
28782 // Only i8 vectors should need custom lowering after this.
28783 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28784 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28785 "Unsupported vector type");
28786
28787 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28788 // logical shift down the upper half and pack back to i8.
28789
28790 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28791 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28792
28793 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28794 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28795 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28796 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28797 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28798 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28799 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28800 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28801 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28802 }
28803
28804 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28805}
28806
28807// Custom lowering for SMULO/UMULO.
28808static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28809 SelectionDAG &DAG) {
28810 MVT VT = Op.getSimpleValueType();
28811
28812 // Scalars defer to LowerXALUO.
28813 if (!VT.isVector())
28814 return LowerXALUO(Op, DAG);
28815
28816 SDLoc dl(Op);
28817 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28818 SDValue A = Op.getOperand(0);
28819 SDValue B = Op.getOperand(1);
28820 EVT OvfVT = Op->getValueType(1);
28821
28822 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28823 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28824 // Extract the LHS Lo/Hi vectors
28825 SDValue LHSLo, LHSHi;
28826 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28827
28828 // Extract the RHS Lo/Hi vectors
28829 SDValue RHSLo, RHSHi;
28830 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28831
28832 EVT LoOvfVT, HiOvfVT;
28833 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28834 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28835 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28836
28837 // Issue the split operations.
28838 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28839 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28840
28841 // Join the separate data results and the overflow results.
28842 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28843 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28844 Hi.getValue(1));
28845
28846 return DAG.getMergeValues({Res, Ovf}, dl);
28847 }
28848
28849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28850 EVT SetccVT =
28851 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28852
28853 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28854 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28855 unsigned NumElts = VT.getVectorNumElements();
28856 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28857 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28858 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28859 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28860 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28861
28862 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28863
28864 SDValue Ovf;
28865 if (IsSigned) {
28866 SDValue High, LowSign;
28867 if (OvfVT.getVectorElementType() == MVT::i1 &&
28868 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28869 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28870 // Shift the high down filling with sign bits.
28871 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28872 // Fill all 16 bits with the sign bit from the low.
28873 LowSign =
28874 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28875 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28876 15, DAG);
28877 SetccVT = OvfVT;
28878 if (!Subtarget.hasBWI()) {
28879 // We can't do a vXi16 compare so sign extend to v16i32.
28880 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28881 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28882 }
28883 } else {
28884 // Otherwise do the compare at vXi8.
28885 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28886 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28887 LowSign =
28888 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28889 }
28890
28891 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28892 } else {
28893 SDValue High =
28894 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28895 if (OvfVT.getVectorElementType() == MVT::i1 &&
28896 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28897 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28898 SetccVT = OvfVT;
28899 if (!Subtarget.hasBWI()) {
28900 // We can't do a vXi16 compare so sign extend to v16i32.
28901 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28902 }
28903 } else {
28904 // Otherwise do the compare at vXi8.
28905 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28906 }
28907
28908 Ovf =
28909 DAG.getSetCC(dl, SetccVT, High,
28910 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28911 }
28912
28913 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28914
28915 return DAG.getMergeValues({Low, Ovf}, dl);
28916 }
28917
28918 SDValue Low;
28919 SDValue High =
28920 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28921
28922 SDValue Ovf;
28923 if (IsSigned) {
28924 // SMULO overflows if the high bits don't match the sign of the low.
28925 SDValue LowSign =
28926 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28927 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28928 } else {
28929 // UMULO overflows if the high bits are non-zero.
28930 Ovf =
28931 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28932 }
28933
28934 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28935
28936 return DAG.getMergeValues({Low, Ovf}, dl);
28937}
28938
28939SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28940 assert(Subtarget.isTargetWin64() && "Unexpected target");
28941 EVT VT = Op.getValueType();
28942 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28943 "Unexpected return type for lowering");
28944
28945 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28946 SmallVector<SDValue> Result;
28947 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28948 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28949 }
28950
28951 RTLIB::Libcall LC;
28952 bool isSigned;
28953 switch (Op->getOpcode()) {
28954 // clang-format off
28955 default: llvm_unreachable("Unexpected request for libcall!");
28956 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28957 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28958 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28959 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28960 // clang-format on
28961 }
28962
28963 SDLoc dl(Op);
28964 SDValue InChain = DAG.getEntryNode();
28965
28966 TargetLowering::ArgListTy Args;
28967 TargetLowering::ArgListEntry Entry;
28968 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
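// Each i128 operand is passed indirectly: spill it to a 16-byte aligned
// stack slot and hand the callee a pointer to that slot.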
28969 EVT ArgVT = Op->getOperand(i).getValueType();
28970 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28971 "Unexpected argument type for lowering");
28972 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28973 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28974 MachinePointerInfo MPI =
28975 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28976 Entry.Node = StackPtr;
28977 InChain =
28978 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28979 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28980 Entry.Ty = PointerType::get(ArgTy,0);
28981 Entry.IsSExt = false;
28982 Entry.IsZExt = false;
28983 Args.push_back(Entry);
28984 }
28985
28986 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28987 getPointerTy(DAG.getDataLayout()));
28988
28989 TargetLowering::CallLoweringInfo CLI(DAG);
28990 CLI.setDebugLoc(dl)
28991 .setChain(InChain)
28992 .setLibCallee(
28993 getLibcallCallingConv(LC),
28994 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28995 std::move(Args))
28996 .setInRegister()
28997 .setSExtResult(isSigned)
28998 .setZExtResult(!isSigned);
28999
29000 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29001 return DAG.getBitcast(VT, CallInfo.first);
29002}
29003
29004SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29005 SelectionDAG &DAG,
29006 SDValue &Chain) const {
29007 assert(Subtarget.isTargetWin64() && "Unexpected target");
29008 EVT VT = Op.getValueType();
29009 bool IsStrict = Op->isStrictFPOpcode();
29010
29011 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29012 EVT ArgVT = Arg.getValueType();
29013
29014 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29015 "Unexpected return type for lowering");
29016
29017 RTLIB::Libcall LC;
29018 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29019 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29020 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29021 else
29022 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29023 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29024
29025 SDLoc dl(Op);
29026 MakeLibCallOptions CallOptions;
29027 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29028
29029 SDValue Result;
29030 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29031 // expected VT (i128).
29032 std::tie(Result, Chain) =
29033 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29034 Result = DAG.getBitcast(VT, Result);
29035 return Result;
29036}
29037
29038SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29039 SelectionDAG &DAG) const {
29040 assert(Subtarget.isTargetWin64() && "Unexpected target");
29041 EVT VT = Op.getValueType();
29042 bool IsStrict = Op->isStrictFPOpcode();
29043
29044 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29045 EVT ArgVT = Arg.getValueType();
29046
29047 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29048 "Unexpected argument type for lowering");
29049
29050 RTLIB::Libcall LC;
29051 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29052 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29053 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29054 else
29055 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29056 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29057
29058 SDLoc dl(Op);
29059 MakeLibCallOptions CallOptions;
29060 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29061
29062 // Pass the i128 argument as an indirect argument on the stack.
29063 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29064 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29065 MachinePointerInfo MPI =
29066 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29067 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29068
29069 SDValue Result;
29070 std::tie(Result, Chain) =
29071 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29072 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29073}
29074
29075// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
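// Each byte of the returned control value is one row of the 8x8 GF(2) bit
// matrix used by gf2p8affine. The SHL/SRL cases slide the identity pattern
// (0x0102040810204080, i.e. the Amt == 0 mask) by Amt positions and mask off
// rows that would shift bits out; SRA additionally ORs in 0x80 rows so the
// sign bit is replicated, and the rotates combine the two opposing shift
// masks.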
29076uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
29077 assert((Amt < 8) && "Shift/Rotation amount out of range");
29078 switch (Opcode) {
29079 case ISD::BITREVERSE:
29080 return 0x8040201008040201ULL;
29081 case ISD::SHL:
29082 return ((0x0102040810204080ULL >> (Amt)) &
29083 (0x0101010101010101ULL * (0xFF >> (Amt))));
29084 case ISD::SRL:
29085 return ((0x0102040810204080ULL << (Amt)) &
29086 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29087 case ISD::SRA:
29088 return (getGFNICtrlImm(ISD::SRL, Amt) |
29089 (0x8080808080808080ULL >> (64 - (8 * Amt))));
29090 case ISD::ROTL:
29091 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29092 case ISD::ROTR:
29093 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29094 }
29095 llvm_unreachable("Unsupported GFNI opcode");
29096}
29097
29098// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
29099SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT,
29100 unsigned Amt = 0) {
29101 assert(VT.getVectorElementType() == MVT::i8 &&
29102 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
29103 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
29104 SmallVector<SDValue> MaskBits;
29105 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
29106 uint64_t Bits = (Imm >> (I % 64)) & 255;
29107 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
29108 }
29109 return DAG.getBuildVector(VT, DL, MaskBits);
29110}
29111
29112// Return true if the required (according to Opcode) shift-imm form is natively
29113// supported by the Subtarget
29114static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29115 unsigned Opcode) {
29116 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29117 "Unexpected shift opcode");
29118
29119 if (!VT.isSimple())
29120 return false;
29121
29122 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29123 return false;
29124
29125 if (VT.getScalarSizeInBits() < 16)
29126 return false;
29127
29128 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29129 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29130 return true;
29131
29132 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29133 (VT.is256BitVector() && Subtarget.hasInt256());
29134
29135 bool AShift = LShift && (Subtarget.hasAVX512() ||
29136 (VT != MVT::v2i64 && VT != MVT::v4i64));
29137 return (Opcode == ISD::SRA) ? AShift : LShift;
29138}
29139
29140// The shift amount is a variable, but it is the same for all vector lanes.
29141// These instructions are defined together with shift-immediate.
29142 static
29143 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29144 unsigned Opcode) {
29145 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29146}
29147
29148// Return true if the required (according to Opcode) variable-shift form is
29149// natively supported by the Subtarget
29150static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29151 unsigned Opcode) {
29152 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29153 "Unexpected shift opcode");
29154
29155 if (!VT.isSimple())
29156 return false;
29157
29158 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29159 return false;
29160
29161 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29162 return false;
29163
29164 // vXi16 supported only on AVX-512, BWI
29165 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29166 return false;
29167
29168 if (Subtarget.hasAVX512() &&
29169 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29170 return true;
29171
29172 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29173 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29174 return (Opcode == ISD::SRA) ? AShift : LShift;
29175}
29176
29177 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29178 const X86Subtarget &Subtarget) {
29179 MVT VT = Op.getSimpleValueType();
29180 SDLoc dl(Op);
29181 SDValue R = Op.getOperand(0);
29182 SDValue Amt = Op.getOperand(1);
29183 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29184 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29185
29186 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29187 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29188 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29189 SDValue Ex = DAG.getBitcast(ExVT, R);
29190
29191 // ashr(R, 63) === cmp_slt(R, 0)
29192 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29193 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29194 "Unsupported PCMPGT op");
29195 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29196 }
29197
29198 if (ShiftAmt >= 32) {
29199 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29200 SDValue Upper =
29201 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29202 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29203 ShiftAmt - 32, DAG);
29204 if (VT == MVT::v2i64)
29205 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29206 if (VT == MVT::v4i64)
29207 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29208 {9, 1, 11, 3, 13, 5, 15, 7});
29209 } else {
29210 // SRA upper i32, SRL whole i64 and select lower i32.
29211 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29212 ShiftAmt, DAG);
29213 SDValue Lower =
29214 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29215 Lower = DAG.getBitcast(ExVT, Lower);
29216 if (VT == MVT::v2i64)
29217 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29218 if (VT == MVT::v4i64)
29219 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29220 {8, 1, 10, 3, 12, 5, 14, 7});
29221 }
29222 return DAG.getBitcast(VT, Ex);
29223 };
29224
29225 // Optimize shl/srl/sra with constant shift amount.
29226 APInt APIntShiftAmt;
29227 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29228 return SDValue();
29229
29230 // If the shift amount is out of range, return undef.
29231 if (APIntShiftAmt.uge(EltSizeInBits))
29232 return DAG.getUNDEF(VT);
29233
29234 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29235
29236 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29237 // Hardware support for vector shifts is sparse, which makes us scalarize the
29238 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
29239 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29240 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29241 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29242 // must be 0). (add undef, undef) however can be any value. To make this
29243 // safe, we must freeze R to ensure that register allocation uses the same
29244 // register for an undefined value. This ensures that the result will
29245 // still be even and preserves the original semantics.
29246 R = DAG.getFreeze(R);
29247 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29248 }
29249
29250 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29251 }
29252
29253 // i64 SRA needs to be performed as partial shifts.
29254 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29255 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29256 Op.getOpcode() == ISD::SRA)
29257 return ArithmeticShiftRight64(ShiftAmt);
29258
29259 // If we're logical shifting an all-signbits value then we can just perform it
29260 // as a mask.
29261 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29262 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29263 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29264 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29265 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29266 }
29267
29268 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29269 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29270 unsigned NumElts = VT.getVectorNumElements();
29271 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29272
29273 // Simple i8 add case
29274 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29275 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29276 // must be 0). (add undef, undef) however can be any value. To make this
29277 // safe, we must freeze R to ensure that register allocation uses the same
29278 // register for an undefined value. This ensures that the result will
29279 // still be even and preserves the original semantics.
29280 R = DAG.getFreeze(R);
29281 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29282 }
29283
29284 // ashr(R, 7) === cmp_slt(R, 0)
29285 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29286 SDValue Zeros = DAG.getConstant(0, dl, VT);
29287 if (VT.is512BitVector()) {
29288 assert(VT == MVT::v64i8 && "Unexpected element type!");
29289 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29290 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29291 }
29292 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29293 }
29294
29295 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29296 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29297 return SDValue();
29298
29299 if (Subtarget.hasGFNI()) {
29300 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
29301 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29302 DAG.getTargetConstant(0, dl, MVT::i8));
29303 }
29304
29305 if (Op.getOpcode() == ISD::SHL) {
29306 // Make a large shift.
29307 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29308 ShiftAmt, DAG);
29309 SHL = DAG.getBitcast(VT, SHL);
29310 // Zero out the rightmost bits.
29311 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29312 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29313 }
29314 if (Op.getOpcode() == ISD::SRL) {
29315 // Make a large shift.
29316 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29317 ShiftAmt, DAG);
29318 SRL = DAG.getBitcast(VT, SRL);
29319 // Zero out the leftmost bits.
29320 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29321 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29322 }
29323 if (Op.getOpcode() == ISD::SRA) {
29324 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
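// e.g. for Amt == 3 and the byte 0xF0: lshr gives 0x1E, Mask is 128 >> 3 ==
// 0x10, the xor clears the shifted-down sign bit (0x0E) and the sub turns it
// back into sign extension: 0xFE == ashr(0xF0, 3).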
29325 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29326
29327 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29328 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29329 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29330 return Res;
29331 }
29332 llvm_unreachable("Unknown shift opcode.");
29333 }
29334
29335 return SDValue();
29336}
29337
29338 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29339 const X86Subtarget &Subtarget) {
29340 MVT VT = Op.getSimpleValueType();
29341 SDLoc dl(Op);
29342 SDValue R = Op.getOperand(0);
29343 SDValue Amt = Op.getOperand(1);
29344 unsigned Opcode = Op.getOpcode();
29345 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29346
29347 int BaseShAmtIdx = -1;
29348 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29349 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29350 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29351 Subtarget, DAG);
29352
29353 // vXi8 shifts - shift as v8i16 + mask result.
29354 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29355 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29356 VT == MVT::v64i8) &&
29357 !Subtarget.hasXOP()) {
29358 unsigned NumElts = VT.getVectorNumElements();
29359 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29360 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29361 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29362 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29363
29364 // Create the mask using vXi16 shifts. For shift-rights we need to move
29365 // the upper byte down before splatting the vXi8 mask.
29366 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29367 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29368 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29369 if (Opcode != ISD::SHL)
29370 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29371 8, DAG);
29372 BitMask = DAG.getBitcast(VT, BitMask);
29373 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29374 SmallVector<int, 64>(NumElts, 0));
29375
29376 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29377 DAG.getBitcast(ExtVT, R), BaseShAmt,
29378 BaseShAmtIdx, Subtarget, DAG);
29379 Res = DAG.getBitcast(VT, Res);
29380 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29381
29382 if (Opcode == ISD::SRA) {
29383 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29384 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29385 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29386 SignMask =
29387 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29388 BaseShAmtIdx, Subtarget, DAG);
29389 SignMask = DAG.getBitcast(VT, SignMask);
29390 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29391 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29392 }
29393 return Res;
29394 }
29395 }
29396 }
29397
29398 return SDValue();
29399}
29400
29401 // Convert a shift/rotate left amount to a multiplication scale factor.
29402 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29403 const X86Subtarget &Subtarget,
29404 SelectionDAG &DAG) {
29405 MVT VT = Amt.getSimpleValueType();
29406 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29407 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29408 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29409 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29410 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29411 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29412 return SDValue();
29413
29414 MVT SVT = VT.getVectorElementType();
29415 unsigned SVTBits = SVT.getSizeInBits();
29416 unsigned NumElems = VT.getVectorNumElements();
29417
29418 APInt UndefElts;
29419 SmallVector<APInt> EltBits;
29420 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29421 APInt One(SVTBits, 1);
29422 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29423 for (unsigned I = 0; I != NumElems; ++I) {
29424 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29425 continue;
29426 uint64_t ShAmt = EltBits[I].getZExtValue();
29427 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29428 }
29429 return DAG.getBuildVector(VT, dl, Elts);
29430 }
29431
29432 // If the target doesn't support variable shifts, use either FP conversion
29433 // or integer multiplication to avoid shifting each element individually.
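// The v4i32 case below materializes 1 << Amt as an IEEE-754 bit pattern:
// shifting Amt into the exponent field (bit 23) and adding the bias
// 0x3f800000 (1.0f) yields 2^Amt (e.g. Amt == 5 gives 0x42000000 == 32.0f),
// which FP_TO_SINT converts back to the integer scale factor.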
29434 if (VT == MVT::v4i32) {
29435 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29436 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29437 DAG.getConstant(0x3f800000U, dl, VT));
29438 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29439 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29440 }
29441
29442 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29443 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29444 SDValue Z = DAG.getConstant(0, dl, VT);
29445 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29446 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29447 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29448 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29449 if (Subtarget.hasSSE41())
29450 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29451 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29452 }
29453
29454 return SDValue();
29455}
29456
29457static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29458 SelectionDAG &DAG) {
29459 MVT VT = Op.getSimpleValueType();
29460 SDLoc dl(Op);
29461 SDValue R = Op.getOperand(0);
29462 SDValue Amt = Op.getOperand(1);
29463 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29464 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29465
29466 unsigned Opc = Op.getOpcode();
29467 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29468 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29469
29470 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29471 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29472
29473 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29474 return V;
29475
29476 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29477 return V;
29478
29479 if (supportedVectorVarShift(VT, Subtarget, Opc))
29480 return Op;
29481
29482 // i64 vector arithmetic shift can be emulated with the transform:
29483 // M = lshr(SIGN_MASK, Amt)
29484 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
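// If the sign bit was set, the logical shift leaves it as a 1 that the XOR
// clears and the SUB then borrows through the zero-filled upper bits,
// setting them all (i.e. sign extension); if it was clear, the XOR/SUB pair
// cancels out and the result is just the logical shift.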
29485 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29486 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29487 Opc == ISD::SRA) {
29488 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29489 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29490 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29491 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29492 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29493 return R;
29494 }
29495
29496 // XOP has 128-bit variable logical/arithmetic shifts.
29497 // +ve/-ve Amt = shift left/right.
29498 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29499 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29500 if (Opc == ISD::SRL || Opc == ISD::SRA)
29501 Amt = DAG.getNegative(Amt, dl, VT);
29502 if (Opc == ISD::SHL || Opc == ISD::SRL)
29503 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29504 if (Opc == ISD::SRA)
29505 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29506 }
29507
29508 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29509 // shifts per-lane and then shuffle the partial results back together.
29510 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29511 // Splat the shift amounts so the scalar shifts above will catch it.
29512 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29513 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29514 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29515 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29516 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29517 }
29518
29519 // If possible, lower this shift as a sequence of two shifts by
29520 // constant plus a BLENDing shuffle instead of scalarizing it.
29521 // Example:
29522 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29523 //
29524 // Could be rewritten as:
29525 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29526 //
29527 // The advantage is that the two shifts from the example would be
29528 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29529 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29530 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29531 SDValue Amt1, Amt2;
29532 unsigned NumElts = VT.getVectorNumElements();
29533 SmallVector<int, 8> ShuffleMask;
29534 for (unsigned i = 0; i != NumElts; ++i) {
29535 SDValue A = Amt->getOperand(i);
29536 if (A.isUndef()) {
29537 ShuffleMask.push_back(SM_SentinelUndef);
29538 continue;
29539 }
29540 if (!Amt1 || Amt1 == A) {
29541 ShuffleMask.push_back(i);
29542 Amt1 = A;
29543 continue;
29544 }
29545 if (!Amt2 || Amt2 == A) {
29546 ShuffleMask.push_back(i + NumElts);
29547 Amt2 = A;
29548 continue;
29549 }
29550 break;
29551 }
29552
29553 // Only perform this blend if we can perform it without loading a mask.
29554 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29555 (VT != MVT::v16i16 ||
29556 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29557 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29558 canWidenShuffleElements(ShuffleMask))) {
29559 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29560 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29561 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29562 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29563 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29564 Cst1->getZExtValue(), DAG);
29565 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29566 Cst2->getZExtValue(), DAG);
29567 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29568 }
29569 }
29570 }
29571
29572 // If possible, lower this packed shift into a vector multiply instead of
29573 // expanding it into a sequence of scalar shifts.
29574 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29575 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29576 Subtarget.canExtendTo512BW())))
29577 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29578 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29579
29580 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29581 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
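// e.g. a constant srl by 4 becomes mulhu(R, 1 << 12), since
// (x * 0x1000) >> 16 == x >> 4. Lanes with a zero shift amount (whose scale
// 1 << 16 doesn't fit in 16 bits) are fixed up by the select below.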
29582 if (Opc == ISD::SRL && ConstantAmt &&
29583 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29584 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29585 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29586 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29587 SDValue Zero = DAG.getConstant(0, dl, VT);
29588 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29589 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29590 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29591 }
29592 }
29593
29594 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29595 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29596 // TODO: Special case handling for shift by 0/1, really we can afford either
29597 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29598 if (Opc == ISD::SRA && ConstantAmt &&
29599 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29600 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29601 !Subtarget.hasAVX512()) ||
29602 DAG.isKnownNeverZero(Amt))) {
29603 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29604 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29605 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29606 SDValue Amt0 =
29607 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29608 SDValue Amt1 =
29609 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29610 SDValue Sra1 =
29611 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29612 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29613 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29614 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29615 }
29616 }
29617
29618 // v4i32 Non Uniform Shifts.
29619 // If the shift amount is constant we can shift each lane using the SSE2
29620 // immediate shifts, else we need to zero-extend each lane to the lower i64
29621 // and shift using the SSE2 variable shifts.
29622 // The separate results can then be blended together.
29623 if (VT == MVT::v4i32) {
29624 SDValue Amt0, Amt1, Amt2, Amt3;
29625 if (ConstantAmt) {
29626 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29627 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29628 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29629 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29630 } else {
29631 // The SSE2 shifts use the lower i64 as the same shift amount for
29632 // all lanes and the upper i64 is ignored. On AVX we're better off
29633 // just zero-extending, but for SSE just duplicating the top 16-bits is
29634 // cheaper and has the same effect for out of range values.
29635 if (Subtarget.hasAVX()) {
29636 SDValue Z = DAG.getConstant(0, dl, VT);
29637 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29638 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29639 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29640 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29641 } else {
29642 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29643 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29644 {4, 5, 6, 7, -1, -1, -1, -1});
29645 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29646 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29647 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29648 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29649 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29650 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29651 }
29652 }
29653
29654 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29655 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29656 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29657 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29658 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29659
29660 // Merge the shifted lane results optimally with/without PBLENDW.
29661 // TODO - ideally shuffle combining would handle this.
29662 if (Subtarget.hasSSE41()) {
29663 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29664 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29665 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29666 }
29667 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29668 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29669 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29670 }
29671
29672 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29673 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29674 // make the existing SSE solution better.
29675 // NOTE: We honor preferred vector width before promoting to 512-bits.
29676 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29677 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29678 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29679 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29680 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29681 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29682 "Unexpected vector type");
29683 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29684 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29685 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29686 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29687 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29688 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29689 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29690 }
29691
29692 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29693 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29694 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29695 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29696 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29697 !Subtarget.hasXOP()) {
29698 int NumElts = VT.getVectorNumElements();
29699 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29700
29701 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29702 // isn't legal).
29703 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29704 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29705 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29706 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29707 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29708 "Constant build vector expected");
29709
29710 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29711 bool IsSigned = Opc == ISD::SRA;
29712 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29713 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29714 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29715 return DAG.getZExtOrTrunc(R, dl, VT);
29716 }
29717
29718 SmallVector<SDValue, 16> LoAmt, HiAmt;
29719 for (int i = 0; i != NumElts; i += 16) {
29720 for (int j = 0; j != 8; ++j) {
29721 LoAmt.push_back(Amt.getOperand(i + j));
29722 HiAmt.push_back(Amt.getOperand(i + j + 8));
29723 }
29724 }
29725
29726 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29727 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29728 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29729
29730 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29731 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29732 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29733 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29734 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29735 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29736 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29737 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29738 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29739 }
29740
29741 if (VT == MVT::v16i8 ||
29742 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29743 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29744 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29745
29746 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29747 if (VT.is512BitVector()) {
29748 // On AVX512BW targets we make use of the fact that VSELECT lowers
29749 // to a masked blend which selects bytes based just on the sign bit
29750 // extracted to a mask.
29751 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29752 V0 = DAG.getBitcast(VT, V0);
29753 V1 = DAG.getBitcast(VT, V1);
29754 Sel = DAG.getBitcast(VT, Sel);
29755 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29756 ISD::SETGT);
29757 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29758 } else if (Subtarget.hasSSE41()) {
29759 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29760 // on the sign bit.
29761 V0 = DAG.getBitcast(VT, V0);
29762 V1 = DAG.getBitcast(VT, V1);
29763 Sel = DAG.getBitcast(VT, Sel);
29764 return DAG.getBitcast(SelVT,
29765 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29766 }
29767 // On pre-SSE41 targets we test for the sign bit by comparing to
29768 // zero - a negative value will set all bits of the lanes to true
29769 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29770 SDValue Z = DAG.getConstant(0, dl, SelVT);
29771 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29772 return DAG.getSelect(dl, SelVT, C, V0, V1);
29773 };
29774
29775 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29776 // We can safely do this using i16 shifts as we're only interested in
29777 // the 3 lower bits of each byte.
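// After the << 5 the sign bit of each byte holds bit 2 of its shift amount,
// so each SignBitSelect below conditionally applies a shift by 4, then
// (after 'a += a' exposes the next bit) by 2, then by 1, composing any
// amount in the range 0-7.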
29778 Amt = DAG.getBitcast(ExtVT, Amt);
29779 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29780 Amt = DAG.getBitcast(VT, Amt);
29781
29782 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29783 // r = VSELECT(r, shift(r, 4), a);
29784 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29785 R = SignBitSelect(VT, Amt, M, R);
29786
29787 // a += a
29788 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29789
29790 // r = VSELECT(r, shift(r, 2), a);
29791 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29792 R = SignBitSelect(VT, Amt, M, R);
29793
29794 // a += a
29795 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29796
29797 // return VSELECT(r, shift(r, 1), a);
29798 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29799 R = SignBitSelect(VT, Amt, M, R);
29800 return R;
29801 }
29802
29803 if (Opc == ISD::SRA) {
29804 // For SRA we need to unpack each byte to the higher byte of an i16 vector
29805 // so we can correctly sign extend. We don't care what happens to the
29806 // lower byte.
29807 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29808 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29809 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29810 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29811 ALo = DAG.getBitcast(ExtVT, ALo);
29812 AHi = DAG.getBitcast(ExtVT, AHi);
29813 RLo = DAG.getBitcast(ExtVT, RLo);
29814 RHi = DAG.getBitcast(ExtVT, RHi);
29815
29816 // r = VSELECT(r, shift(r, 4), a);
29817 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29818 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29819 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29820 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29821
29822 // a += a
29823 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29824 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29825
29826 // r = VSELECT(r, shift(r, 2), a);
29827 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29828 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29829 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29830 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29831
29832 // a += a
29833 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29834 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29835
29836 // r = VSELECT(r, shift(r, 1), a);
29837 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29838 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29839 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29840 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29841
29842 // Logical shift the result back to the lower byte, leaving a zero upper
29843 // byte meaning that we can safely pack with PACKUSWB.
29844 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29845 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29846 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29847 }
29848 }
29849
29850 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29851 MVT ExtVT = MVT::v8i32;
29852 SDValue Z = DAG.getConstant(0, dl, VT);
29853 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29854 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29855 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29856 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29857 ALo = DAG.getBitcast(ExtVT, ALo);
29858 AHi = DAG.getBitcast(ExtVT, AHi);
29859 RLo = DAG.getBitcast(ExtVT, RLo);
29860 RHi = DAG.getBitcast(ExtVT, RHi);
29861 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29862 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29863 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29864 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29865 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29866 }
29867
29868 if (VT == MVT::v8i16) {
29869 // If we have a constant shift amount, the non-SSE41 path is best as
29870 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29871 bool UseSSE41 = Subtarget.hasSSE41() &&
29872 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29873
29874 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29875 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29876 // the sign bit.
29877 if (UseSSE41) {
29878 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29879 V0 = DAG.getBitcast(ExtVT, V0);
29880 V1 = DAG.getBitcast(ExtVT, V1);
29881 Sel = DAG.getBitcast(ExtVT, Sel);
29882 return DAG.getBitcast(
29883 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29884 }
29885 // On pre-SSE41 targets we splat the sign bit - a negative value will
29886 // set all bits of the lanes to true and VSELECT uses that in
29887 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29888 SDValue C =
29889 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29890 return DAG.getSelect(dl, VT, C, V0, V1);
29891 };
29892
29893 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29894 if (UseSSE41) {
29895 // On SSE41 targets we need to replicate the shift mask in both
29896 // bytes for PBLENDVB.
29897 Amt = DAG.getNode(
29898 ISD::OR, dl, VT,
29899 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29900 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29901 } else {
29902 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29903 }
29904
29905 // r = VSELECT(r, shift(r, 8), a);
29906 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29907 R = SignBitSelect(Amt, M, R);
29908
29909 // a += a
29910 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29911
29912 // r = VSELECT(r, shift(r, 4), a);
29913 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29914 R = SignBitSelect(Amt, M, R);
29915
29916 // a += a
29917 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29918
29919 // r = VSELECT(r, shift(r, 2), a);
29920 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29921 R = SignBitSelect(Amt, M, R);
29922
29923 // a += a
29924 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29925
29926 // return VSELECT(r, shift(r, 1), a);
29927 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29928 R = SignBitSelect(Amt, M, R);
29929 return R;
29930 }
29931
29932 // Decompose 256-bit shifts into 128-bit shifts.
29933 if (VT.is256BitVector())
29934 return splitVectorIntBinary(Op, DAG, dl);
29935
29936 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29937 return splitVectorIntBinary(Op, DAG, dl);
29938
29939 return SDValue();
29940}
29941
29942 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29943 SelectionDAG &DAG) {
29944 MVT VT = Op.getSimpleValueType();
29945 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29946 "Unexpected funnel shift opcode!");
29947
29948 SDLoc DL(Op);
29949 SDValue Op0 = Op.getOperand(0);
29950 SDValue Op1 = Op.getOperand(1);
29951 SDValue Amt = Op.getOperand(2);
29952 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29953 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29954
29955 if (VT.isVector()) {
29956 APInt APIntShiftAmt;
29957 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29958 unsigned NumElts = VT.getVectorNumElements();
29959
29960 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29961 if (IsFSHR)
29962 std::swap(Op0, Op1);
29963
29964 if (IsCstSplat) {
29965 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29966 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29967 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29968 {Op0, Op1, Imm}, DAG, Subtarget);
29969 }
29970 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29971 {Op0, Op1, Amt}, DAG, Subtarget);
29972 }
29973 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29974 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29975 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29976 "Unexpected funnel shift type!");
29977
29978 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29979 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
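// Conceptually the pair x:y is treated as a single 2*bw-bit value: FSHL
// shifts it left by the (modulo bw) amount and keeps the upper bw bits,
// while FSHR shifts it right and keeps the lower bw bits.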
29980 if (IsCstSplat) {
29981 // TODO: Can't use generic expansion as UNDEF amt elements can be
29982 // converted to other values when folded to shift amounts, losing the
29983 // splat.
29984 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29985 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29986 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29987 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29988 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29989
29990 if (EltSizeInBits == 8 &&
29991 (Subtarget.hasXOP() ||
29992 (useVPTERNLOG(Subtarget, VT) &&
29993 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
29994 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
29995 // bit-select - lower using vXi16 shifts and then perform the bitmask at
29996 // the original vector width to handle cases where we split.
29997 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
29998 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
29999 SDValue ShX =
30000 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
30001 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
30002 SDValue ShY =
30003 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
30004 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
30005 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
30006 DAG.getConstant(MaskX, DL, VT));
30007 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
30008 DAG.getConstant(MaskY, DL, VT));
30009 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30010 }
30011
30012 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
30013 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
30014 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
30015 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
30016 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30017 }
30018
30019 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30020 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30021 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30022
30023 // Constant vXi16 funnel shifts can be efficiently handled by default.
30024 if (IsCst && EltSizeInBits == 16)
30025 return SDValue();
30026
30027 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30028 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30029 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30030
30031 // Split 256-bit integers on XOP/pre-AVX2 targets.
30032 // Split 512-bit integers on non 512-bit BWI targets.
30033 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30034 !Subtarget.hasAVX2())) ||
30035 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30036 EltSizeInBits < 32)) {
30037 // Pre-mask the amount modulo using the wider vector.
30038 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30039 return splitVectorOp(Op, DAG, DL);
30040 }
30041
30042 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30043 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30044 int ScalarAmtIdx = -1;
30045 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30046 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30047 if (EltSizeInBits == 16)
30048 return SDValue();
30049
30050 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30051 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30052 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30053 ScalarAmtIdx, Subtarget, DAG);
30054 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30055 ScalarAmtIdx, Subtarget, DAG);
30056 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30057 }
30058 }
30059
30060 MVT WideSVT = MVT::getIntegerVT(
30061 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30062 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30063
30064 // If per-element shifts are legal, fallback to generic expansion.
30065 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30066 return SDValue();
30067
30068 // Attempt to fold as:
30069 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30070 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30071 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30072 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30073 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30074 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30075 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30076 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30077 EltSizeInBits, DAG);
30078 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30079 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30080 if (!IsFSHR)
30081 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30082 EltSizeInBits, DAG);
30083 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30084 }
30085
30086 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30087 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30088 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30089 SDValue Z = DAG.getConstant(0, DL, VT);
30090 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30091 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30092 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30093 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30094 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30095 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30096 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30097 }
30098
30099 // Fallback to generic expansion.
30100 return SDValue();
30101 }
30102 assert(
30103 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30104 "Unexpected funnel shift type!");
30105
30106 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30107 bool OptForSize = DAG.shouldOptForSize();
30108 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30109
30110 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30111 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30112 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30113 !isa<ConstantSDNode>(Amt)) {
30114 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30115 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30116 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30117 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30118 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30119 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30120 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30121 if (IsFSHR) {
30122 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30123 } else {
30124 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30125 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30126 }
30127 return DAG.getZExtOrTrunc(Res, DL, VT);
30128 }
30129
30130 if (VT == MVT::i8 || ExpandFunnel)
30131 return SDValue();
30132
30133 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30134 if (VT == MVT::i16) {
30135 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30136 DAG.getConstant(15, DL, Amt.getValueType()));
30137 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30138 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30139 }
30140
30141 return Op;
30142}
30143
30144static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30145 SelectionDAG &DAG) {
30146 MVT VT = Op.getSimpleValueType();
30147 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30148
30149 SDLoc DL(Op);
30150 SDValue R = Op.getOperand(0);
30151 SDValue Amt = Op.getOperand(1);
30152 unsigned Opcode = Op.getOpcode();
30153 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30154 int NumElts = VT.getVectorNumElements();
30155 bool IsROTL = Opcode == ISD::ROTL;
30156
30157 // Check for constant splat rotation amount.
30158 APInt CstSplatValue;
30159 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30160
30161 // Check for splat rotate by zero.
30162 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30163 return R;
30164
30165 // AVX512 implicitly uses modulo rotation amounts.
30166 if ((Subtarget.hasVLX() ||
30167 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
30168 32 <= EltSizeInBits) {
30169 // Attempt to rotate by immediate.
30170 if (IsCstSplat) {
30171 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30172 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30173 return DAG.getNode(RotOpc, DL, VT, R,
30174 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30175 }
30176
30177 // Else, fall-back on VPROLV/VPRORV.
30178 return Op;
30179 }
30180
30181 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30182 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30183 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30184 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30185 }
30186
30187 SDValue Z = DAG.getConstant(0, DL, VT);
30188
30189 if (!IsROTL) {
30190 // If the ISD::ROTR amount is constant, we're always better off converting to
30191 // ISD::ROTL.
30192 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30193 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30194
30195 // XOP targets always prefer ISD::ROTL.
30196 if (Subtarget.hasXOP())
30197 return DAG.getNode(ISD::ROTL, DL, VT, R,
30198 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30199 }
30200
30201 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
30202 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30203 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30204 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30205 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
30206 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30207 DAG.getTargetConstant(0, DL, MVT::i8));
30208 }
30209
30210 // Split 256-bit integers on XOP/pre-AVX2 targets.
30211 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30212 return splitVectorIntBinary(Op, DAG, DL);
30213
30214 // XOP has 128-bit vector variable + immediate rotates.
30215 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30216 // XOP implicitly uses modulo rotation amounts.
30217 if (Subtarget.hasXOP()) {
30218 assert(IsROTL && "Only ROTL expected");
30219 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30220
30221 // Attempt to rotate by immediate.
30222 if (IsCstSplat) {
30223 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30224 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30225 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30226 }
30227
30228 // Use general rotate by variable (per-element).
30229 return Op;
30230 }
30231
30232 // Rotate by a uniform constant - expand back to shifts.
30233 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30234 // to other values when folded to shift amounts, losing the splat.
30235 if (IsCstSplat) {
30236 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30237 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30238 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30239 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30240 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30241 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30242 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30243 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30244 }
30245
30246 // Split 512-bit integers on non 512-bit BWI targets.
30247 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30248 return splitVectorIntBinary(Op, DAG, DL);
30249
30250 assert(
30251 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30252 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30253 Subtarget.hasAVX2()) ||
30254 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30255 "Only vXi32/vXi16/vXi8 vector rotates supported");
30256
30257 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30258 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30259
30260 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30261 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30262
30263 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30264 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30265 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
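  // Each unpacked 2*bw lane holds two copies of the element, (x << bw) | x, so a
  // single wide shift by the masked amount leaves rotl(x,y) in the upper half and
  // rotr(x,y) in the lower half, which getPack then extracts.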
30266 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30267 int BaseRotAmtIdx = -1;
30268 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30269 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30270 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30271 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30272 }
30273 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30274 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30275 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30276 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30277 BaseRotAmtIdx, Subtarget, DAG);
30278 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30279 BaseRotAmtIdx, Subtarget, DAG);
30280 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30281 }
30282 }
30283
30284 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30285 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30286
30287 // Attempt to fold as unpack(x,x) << zext(y):
30288 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30289 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30290 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30291 if (!(ConstantAmt && EltSizeInBits != 8) &&
30292 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30293 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30294 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30295 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30296 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30297 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30298 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30299 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30300 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30301 }
30302
30303 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30304 // the amount bit.
30305 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
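  // Each stage rotates by 4/2/1 and is kept only if the corresponding amount bit
  // is set; e.g. an amount of 6 (0b110) applies the rot4 and rot2 stages and
  // skips the rot1 stage.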
30306 if (EltSizeInBits == 8) {
30307 MVT WideVT =
30308 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30309
30310 // Attempt to fold as:
30311 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30312 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30313 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30314 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30315 // If we're rotating by constant, just use default promotion.
30316 if (ConstantAmt)
30317 return SDValue();
30318 // See if we can perform this by widening to vXi16 or vXi32.
30319 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30320 R = DAG.getNode(
30321 ISD::OR, DL, WideVT, R,
30322 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30323 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30324 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30325 if (IsROTL)
30326 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30327 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30328 }
30329
30330 // We don't need ModuloAmt here as we just peek at individual bits.
30331 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30332 if (Subtarget.hasSSE41()) {
30333 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30334 // on the sign bit.
30335 V0 = DAG.getBitcast(VT, V0);
30336 V1 = DAG.getBitcast(VT, V1);
30337 Sel = DAG.getBitcast(VT, Sel);
30338 return DAG.getBitcast(SelVT,
30339 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30340 }
30341 // On pre-SSE41 targets we test for the sign bit by comparing to
30342 // zero - a negative value will set all bits of the lanes to true
30343 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30344 SDValue Z = DAG.getConstant(0, DL, SelVT);
30345 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30346 return DAG.getSelect(DL, SelVT, C, V0, V1);
30347 };
30348
30349 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30350 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30351 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30352 IsROTL = true;
30353 }
30354
30355 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30356 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30357
30358 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30359 // We can safely do this using i16 shifts as we're only interested in
30360 // the 3 lower bits of each byte.
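    // After the shift, bit 2 of the rotate amount sits in each byte's sign bit;
    // the 'a += a' between stages then moves bit 1 and finally bit 0 up there.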
30361 Amt = DAG.getBitcast(ExtVT, Amt);
30362 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30363 Amt = DAG.getBitcast(VT, Amt);
30364
30365 // r = VSELECT(r, rot(r, 4), a);
30366 SDValue M;
30367 M = DAG.getNode(
30368 ISD::OR, DL, VT,
30369 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30370 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30371 R = SignBitSelect(VT, Amt, M, R);
30372
30373 // a += a
30374 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30375
30376 // r = VSELECT(r, rot(r, 2), a);
30377 M = DAG.getNode(
30378 ISD::OR, DL, VT,
30379 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30380 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30381 R = SignBitSelect(VT, Amt, M, R);
30382
30383 // a += a
30384 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30385
30386 // return VSELECT(r, rot(r, 1), a);
30387 M = DAG.getNode(
30388 ISD::OR, DL, VT,
30389 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30390 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30391 return SignBitSelect(VT, Amt, M, R);
30392 }
30393
30394 bool IsSplatAmt = DAG.isSplatValue(Amt);
30395 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30396 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30397
30398 // Fallback for splats + all supported variable shifts.
30399   // Fallback for non-constant AVX2 vXi16 as well.
30400 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30401 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30402 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30403 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30404 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30405 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30406 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30407 }
30408
30409 // Everything below assumes ISD::ROTL.
30410 if (!IsROTL) {
30411 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30412 IsROTL = true;
30413 }
30414
30415 // ISD::ROT* uses modulo rotate amounts.
30416 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30417
30418 assert(IsROTL && "Only ROTL supported");
30419
30420 // As with shifts, attempt to convert the rotation amount to a multiplication
30421 // factor, fallback to general expansion.
30422 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30423 if (!Scale)
30424 return SDValue();
30425
30426 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30427 if (EltSizeInBits == 16) {
30428 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30429 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30430 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30431 }
30432
30433 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30434 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30435 // that can then be OR'd with the lower 32-bits.
30436 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30437 static const int OddMask[] = {1, -1, 3, -1};
30438 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30439 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30440
30441 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30442 DAG.getBitcast(MVT::v2i64, R),
30443 DAG.getBitcast(MVT::v2i64, Scale));
30444 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30445 DAG.getBitcast(MVT::v2i64, R13),
30446 DAG.getBitcast(MVT::v2i64, Scale13));
30447 Res02 = DAG.getBitcast(VT, Res02);
30448 Res13 = DAG.getBitcast(VT, Res13);
30449
30450 return DAG.getNode(ISD::OR, DL, VT,
30451 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30452 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30453}
30454
30455/// Returns true if the operand type is exactly twice the native width, and
30456/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30457/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30458/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30459bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30460 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30461
30462 if (OpWidth == 64)
30463 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30464 if (OpWidth == 128)
30465 return Subtarget.canUseCMPXCHG16B();
30466
30467 return false;
30468}
30469 
30470 TargetLowering::AtomicExpansionKind
30471 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30472 Type *MemType = SI->getValueOperand()->getType();
30473
30474 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30475 !Subtarget.useSoftFloat()) {
30476 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30477         (Subtarget.hasSSE1() || Subtarget.hasX87()))
30478       return AtomicExpansionKind::None;
30479 
30480 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30481         Subtarget.hasAVX())
30482       return AtomicExpansionKind::None;
30483   }
30484
30485   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30486                                  : AtomicExpansionKind::None;
30487 }
30488
30489 // Note: this turns large loads into lock cmpxchg8b/16b.
30490 TargetLowering::AtomicExpansionKind
30491 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30492 Type *MemType = LI->getType();
30493
30494 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30495 !Subtarget.useSoftFloat()) {
30496     // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30497 // can use movq to do the load. If we have X87 we can load into an 80-bit
30498 // X87 register and store it to a stack temporary.
30499 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30500         (Subtarget.hasSSE1() || Subtarget.hasX87()))
30501       return AtomicExpansionKind::None;
30502 
30503 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
30504 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30505         Subtarget.hasAVX())
30506       return AtomicExpansionKind::None;
30507   }
30508
30509   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30510                                  : AtomicExpansionKind::None;
30511 }
30512
30513 enum BitTestKind : unsigned {
30514   UndefBit,
30515   ConstantBit,
30516   NotConstantBit,
30517   ShiftBit,
30518   NotShiftBit
30519 };
30520 
30521static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30522 using namespace llvm::PatternMatch;
30523 BitTestKind BTK = UndefBit;
30524 auto *C = dyn_cast<ConstantInt>(V);
30525 if (C) {
30526 // Check if V is a power of 2 or NOT power of 2.
30527 if (isPowerOf2_64(C->getZExtValue()))
30528 BTK = ConstantBit;
30529 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30530 BTK = NotConstantBit;
30531 return {V, BTK};
30532 }
30533
30534 // Check if V is some power of 2 pattern known to be non-zero
30535 auto *I = dyn_cast<Instruction>(V);
30536 if (I) {
30537 bool Not = false;
30538 // Check if we have a NOT
30539 Value *PeekI;
30540 if (match(I, m_Not(m_Value(PeekI))) ||
30541 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30542 Not = true;
30543 I = dyn_cast<Instruction>(PeekI);
30544
30545       // If I is constant, it will fold and we can evaluate later. If it's an
30546 // argument or something of that nature, we can't analyze.
30547 if (I == nullptr)
30548 return {nullptr, UndefBit};
30549 }
30550 // We can only use 1 << X without more sophisticated analysis. C << X where
30551 // C is a power of 2 but not 1 can result in zero which cannot be translated
30552 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30553 if (I->getOpcode() == Instruction::Shl) {
30554 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30555 // -X` and some other provable power of 2 patterns that we can use CTZ on
30556 // may be profitable.
30557 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30558 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30559 // be provably a non-zero power of 2.
30560 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30561 // transformable to bittest.
30562 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30563 if (!ShiftVal)
30564 return {nullptr, UndefBit};
30565 if (ShiftVal->equalsInt(1))
30566 BTK = Not ? NotShiftBit : ShiftBit;
30567
30568 if (BTK == UndefBit)
30569 return {nullptr, UndefBit};
30570
30571 Value *BitV = I->getOperand(1);
30572
30573 Value *AndOp;
30574 const APInt *AndC;
30575 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30576         // Read past a shift-mask instruction to find the count.
30577 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30578 BitV = AndOp;
30579 }
30580 return {BitV, BTK};
30581 }
30582 }
30583 return {nullptr, UndefBit};
30584}
30585 
30586 TargetLowering::AtomicExpansionKind
30587 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30588 using namespace llvm::PatternMatch;
30589 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30590 // prefix to a normal instruction for these operations.
30591   if (AI->use_empty())
30592     return AtomicExpansionKind::None;
30593 
30594 if (AI->getOperation() == AtomicRMWInst::Xor) {
30595 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30596 // preferable to both `cmpxchg` and `btc`.
30597     if (match(AI->getOperand(1), m_SignMask()))
30598       return AtomicExpansionKind::None;
30599   }
30600
30601 // If the atomicrmw's result is used by a single bit AND, we may use
30602 // bts/btr/btc instruction for these operations.
30603 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30604 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30605 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30606 // detect it.
30607 Instruction *I = AI->user_back();
30608 auto BitChange = FindSingleBitChange(AI->getValOperand());
30609 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30610 I->getOpcode() != Instruction::And ||
30611 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30612       AI->getParent() != I->getParent())
30613     return AtomicExpansionKind::CmpXChg;
30614 
30615 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30616
30617 // This is a redundant AND, it should get cleaned up elsewhere.
30618   if (AI == I->getOperand(OtherIdx))
30619     return AtomicExpansionKind::CmpXChg;
30620 
30621   // The following instruction must be an AND of a single bit.
30622 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30623 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30624 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30625     if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30626       return AtomicExpansionKind::CmpXChg;
30627     }
30628     if (AI->getOperation() == AtomicRMWInst::And) {
30629       return ~C1->getValue() == C2->getValue()
30630                  ? AtomicExpansionKind::BitTestIntrinsic
30631                  : AtomicExpansionKind::CmpXChg;
30632     }
30633     return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30634                     : AtomicExpansionKind::CmpXChg;
30635   }
30636
30637 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30638
30639 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30640   if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30641     return AtomicExpansionKind::CmpXChg;
30642 
30643 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30644
30645 // If shift amounts are not the same we can't use BitTestIntrinsic.
30646   if (BitChange.first != BitTested.first)
30647     return AtomicExpansionKind::CmpXChg;
30648 
30649   // For an atomic AND, the value operand must clear exactly one bit and the
30650   // user must test the one bit that is unset in that mask.
30651   if (AI->getOperation() == AtomicRMWInst::And)
30652     return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30653                ? AtomicExpansionKind::BitTestIntrinsic
30654                : AtomicExpansionKind::CmpXChg;
30655 
30656   // For atomic XOR/OR, the operand must set and the user must test the same bit.
30657   return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30658              ? AtomicExpansionKind::BitTestIntrinsic
30659              : AtomicExpansionKind::CmpXChg;
30660 }
30661
30662void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30663 IRBuilder<> Builder(AI);
30664   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30665   Intrinsic::ID IID_C;
30666   Intrinsic::ID IID_I;
30667   switch (AI->getOperation()) {
30668 default:
30669 llvm_unreachable("Unknown atomic operation");
30670 case AtomicRMWInst::Or:
30671 IID_C = Intrinsic::x86_atomic_bts;
30672 IID_I = Intrinsic::x86_atomic_bts_rm;
30673 break;
30674 case AtomicRMWInst::Xor:
30675 IID_C = Intrinsic::x86_atomic_btc;
30676 IID_I = Intrinsic::x86_atomic_btc_rm;
30677 break;
30678 case AtomicRMWInst::And:
30679 IID_C = Intrinsic::x86_atomic_btr;
30680 IID_I = Intrinsic::x86_atomic_btr_rm;
30681 break;
30682 }
30683 Instruction *I = AI->user_back();
30684 LLVMContext &Ctx = AI->getContext();
30685   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30686                                           PointerType::getUnqual(Ctx));
30687   Function *BitTest = nullptr;
30688 Value *Result = nullptr;
30689 auto BitTested = FindSingleBitChange(AI->getValOperand());
30690 assert(BitTested.first != nullptr);
30691
30692 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30693 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30694
30695 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30696
30697 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30698 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30699 } else {
30700 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30701
30702 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30703
30704 Value *SI = BitTested.first;
30705 assert(SI != nullptr);
30706
30707     // BT{S|R|C} on a memory operand doesn't take the bit position modulo the
30708     // operand width, so we need to mask it ourselves.
30709 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30710 Value *BitPos =
30711 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30712 // Todo(1): In many cases it may be provable that SI is less than
30713 // ShiftBits in which case this mask is unnecessary
30714 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30715 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30716 // favor of just a raw BT{S|R|C}.
30717
30718 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30719 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30720
30721 // If the result is only used for zero/non-zero status then we don't need to
30722     // shift the value back. Otherwise do so.
30723 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30724 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30725 if (ICmp->isEquality()) {
30726 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30727 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30728 if (C0 || C1) {
30729 assert(C0 == nullptr || C1 == nullptr);
30730 if ((C0 ? C0 : C1)->isZero())
30731 continue;
30732 }
30733 }
30734 }
30735 Result = Builder.CreateShl(Result, BitPos);
30736 break;
30737 }
30738 }
30739
30740 I->replaceAllUsesWith(Result);
30741 I->eraseFromParent();
30742 AI->eraseFromParent();
30743}
30744 
30745 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30746   using namespace llvm::PatternMatch;
30747 if (!AI->hasOneUse())
30748 return false;
30749
30750   Value *Op = AI->getOperand(1);
30751   ICmpInst::Predicate Pred;
30752   Instruction *I = AI->user_back();
30753   AtomicRMWInst::BinOp Opc = AI->getOperation();
30754   if (Opc == AtomicRMWInst::Add) {
30755 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30756 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30757 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30758 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30759 return Pred == CmpInst::ICMP_SLT;
30760 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30761 return Pred == CmpInst::ICMP_SGT;
30762 }
30763 return false;
30764 }
30765 if (Opc == AtomicRMWInst::Sub) {
30766 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30767 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30768 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30769 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30770 return Pred == CmpInst::ICMP_SLT;
30771 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30772 return Pred == CmpInst::ICMP_SGT;
30773 }
30774 return false;
30775 }
30776   if ((Opc == AtomicRMWInst::Or &&
30777        match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30778       (Opc == AtomicRMWInst::And &&
30779        match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30780 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30781 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30782 Pred == CmpInst::ICMP_SLT;
30783 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30784 return Pred == CmpInst::ICMP_SGT;
30785 return false;
30786 }
30787 if (Opc == AtomicRMWInst::Xor) {
30788 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30789 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30790 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30791 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30792 return Pred == CmpInst::ICMP_SLT;
30793 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30794 return Pred == CmpInst::ICMP_SGT;
30795 }
30796 return false;
30797 }
30798
30799 return false;
30800}
30801
30802void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30803 AtomicRMWInst *AI) const {
30804 IRBuilder<> Builder(AI);
30805 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30806 Instruction *TempI = nullptr;
30807 LLVMContext &Ctx = AI->getContext();
30808 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30809 if (!ICI) {
30810 TempI = AI->user_back();
30811 assert(TempI->hasOneUse() && "Must have one use");
30812 ICI = cast<ICmpInst>(TempI->user_back());
30813   }
30814   X86::CondCode CC = X86::COND_INVALID;
30815   ICmpInst::Predicate Pred = ICI->getPredicate();
30816 switch (Pred) {
30817 default:
30818 llvm_unreachable("Not supported Pred");
30819 case CmpInst::ICMP_EQ:
30820 CC = X86::COND_E;
30821 break;
30822 case CmpInst::ICMP_NE:
30823 CC = X86::COND_NE;
30824 break;
30825 case CmpInst::ICMP_SLT:
30826 CC = X86::COND_S;
30827 break;
30828 case CmpInst::ICMP_SGT:
30829 CC = X86::COND_NS;
30830 break;
30831   }
30832   Intrinsic::ID IID = Intrinsic::not_intrinsic;
30833   switch (AI->getOperation()) {
30834 default:
30835 llvm_unreachable("Unknown atomic operation");
30836 case AtomicRMWInst::Add:
30837 IID = Intrinsic::x86_atomic_add_cc;
30838 break;
30839 case AtomicRMWInst::Sub:
30840 IID = Intrinsic::x86_atomic_sub_cc;
30841 break;
30842 case AtomicRMWInst::Or:
30843 IID = Intrinsic::x86_atomic_or_cc;
30844 break;
30845 case AtomicRMWInst::And:
30846 IID = Intrinsic::x86_atomic_and_cc;
30847 break;
30848 case AtomicRMWInst::Xor:
30849 IID = Intrinsic::x86_atomic_xor_cc;
30850 break;
30851 }
30852 Function *CmpArith =
30853 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30854   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30855                                           PointerType::getUnqual(Ctx));
30856   Value *Call = Builder.CreateCall(
30857 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30858 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30859 ICI->replaceAllUsesWith(Result);
30860 ICI->eraseFromParent();
30861 if (TempI)
30862 TempI->eraseFromParent();
30863 AI->eraseFromParent();
30864}
30865 
30866 TargetLowering::AtomicExpansionKind
30867 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30868 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30869 Type *MemType = AI->getType();
30870
30871 // If the operand is too big, we must see if cmpxchg8/16b is available
30872 // and default to library calls otherwise.
30873 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30874     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30875                                    : AtomicExpansionKind::None;
30876   }
30877 
30878   AtomicRMWInst::BinOp Op = AI->getOperation();
30879   switch (Op) {
30880   case AtomicRMWInst::Xchg:
30881     return AtomicExpansionKind::None;
30882   case AtomicRMWInst::Add:
30883   case AtomicRMWInst::Sub:
30884     if (shouldExpandCmpArithRMWInIR(AI))
30885       return AtomicExpansionKind::CmpArithIntrinsic;
30886     // It's better to use xadd, xsub or xchg for these in other cases.
30887     return AtomicExpansionKind::None;
30888   case AtomicRMWInst::Or:
30889   case AtomicRMWInst::And:
30890   case AtomicRMWInst::Xor:
30891     if (shouldExpandCmpArithRMWInIR(AI))
30892       return AtomicExpansionKind::CmpArithIntrinsic;
30893     return shouldExpandLogicAtomicRMWInIR(AI);
30894   case AtomicRMWInst::Nand:
30895   case AtomicRMWInst::Max:
30896   case AtomicRMWInst::Min:
30897   case AtomicRMWInst::UMax:
30898   case AtomicRMWInst::UMin:
30899   case AtomicRMWInst::FAdd:
30900   case AtomicRMWInst::FSub:
30901   case AtomicRMWInst::FMax:
30902   case AtomicRMWInst::FMin:
30903   case AtomicRMWInst::UIncWrap:
30904   case AtomicRMWInst::UDecWrap:
30905   default:
30906     // These always require a non-trivial set of data operations on x86. We must
30907     // use a cmpxchg loop.
30908     return AtomicExpansionKind::CmpXChg;
30909   }
30910 }
30911
30912LoadInst *
30913X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30914 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30915 Type *MemType = AI->getType();
30916 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30917 // there is no benefit in turning such RMWs into loads, and it is actually
30918 // harmful as it introduces a mfence.
30919 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30920 return nullptr;
30921
30922 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30923 // lowering available in lowerAtomicArith.
30924 // TODO: push more cases through this path.
30925 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30926 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30927 AI->use_empty())
30928 return nullptr;
30929
30930 IRBuilder<> Builder(AI);
30931 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30932 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30933 auto SSID = AI->getSyncScopeID();
30934 // We must restrict the ordering to avoid generating loads with Release or
30935   // ReleaseAcquire orderings.
30936   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30937 
30938 // Before the load we need a fence. Here is an example lifted from
30939 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30940 // is required:
30941 // Thread 0:
30942 // x.store(1, relaxed);
30943 // r1 = y.fetch_add(0, release);
30944 // Thread 1:
30945 // y.fetch_add(42, acquire);
30946 // r2 = x.load(relaxed);
30947 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30948 // lowered to just a load without a fence. A mfence flushes the store buffer,
30949 // making the optimization clearly correct.
30950 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30951 // otherwise, we might be able to be more aggressive on relaxed idempotent
30952 // rmw. In practice, they do not look useful, so we don't try to be
30953 // especially clever.
30954 if (SSID == SyncScope::SingleThread)
30955 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30956 // the IR level, so we must wrap it in an intrinsic.
30957 return nullptr;
30958
30959 if (!Subtarget.hasMFence())
30960 // FIXME: it might make sense to use a locked operation here but on a
30961 // different cache-line to prevent cache-line bouncing. In practice it
30962 // is probably a small win, and x86 processors without mfence are rare
30963 // enough that we do not bother.
30964 return nullptr;
30965
30966 Function *MFence =
30967 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30968 Builder.CreateCall(MFence, {});
30969
30970 // Finally we can emit the atomic load.
30971 LoadInst *Loaded = Builder.CreateAlignedLoad(
30972 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30973 Loaded->setAtomic(Order, SSID);
30974 AI->replaceAllUsesWith(Loaded);
30975 AI->eraseFromParent();
30976 return Loaded;
30977}
30978
30979/// Emit a locked operation on a stack location which does not change any
30980/// memory location, but does involve a lock prefix. Location is chosen to be
30981/// a) very likely accessed only by a single thread to minimize cache traffic,
30982/// and b) definitely dereferenceable. Returns the new Chain result.
30983 static SDValue emitLockedStackOp(SelectionDAG &DAG,
30984                                  const X86Subtarget &Subtarget, SDValue Chain,
30985 const SDLoc &DL) {
30986 // Implementation notes:
30987 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30988 // operations issued by the current processor. As such, the location
30989 // referenced is not relevant for the ordering properties of the instruction.
30990   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30991 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30992 // 2) Using an immediate operand appears to be the best encoding choice
30993 // here since it doesn't require an extra register.
30994 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30995 // is small enough it might just be measurement noise.)
30996 // 4) When choosing offsets, there are several contributing factors:
30997 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30998 // line aligned stack object to improve this case.)
30999 // b) To minimize our chances of introducing a false dependence, we prefer
31000 // to offset the stack usage from TOS slightly.
31001 // c) To minimize concerns about cross thread stack usage - in particular,
31002 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31003 // captures state in the TOS frame and accesses it from many threads -
31004 // we want to use an offset such that the offset is in a distinct cache
31005 // line from the TOS frame.
31006 //
31007 // For a general discussion of the tradeoffs and benchmark results, see:
31008 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
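  // The net effect is a `lock or` of an immediate zero against [SP - 64] when a
  // 128-byte red zone exists, or against [SP] otherwise, which orders memory
  // without modifying it.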
31009
31010 auto &MF = DAG.getMachineFunction();
31011 auto &TFL = *Subtarget.getFrameLowering();
31012 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31013
31014 if (Subtarget.is64Bit()) {
31015 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31016 SDValue Ops[] = {
31017 DAG.getRegister(X86::RSP, MVT::i64), // Base
31018 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31019 DAG.getRegister(0, MVT::i64), // Index
31020 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31021 DAG.getRegister(0, MVT::i16), // Segment.
31022 Zero,
31023 Chain};
31024 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31025 MVT::Other, Ops);
31026 return SDValue(Res, 1);
31027 }
31028
31029 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31030 SDValue Ops[] = {
31031 DAG.getRegister(X86::ESP, MVT::i32), // Base
31032 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31033 DAG.getRegister(0, MVT::i32), // Index
31034 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31035 DAG.getRegister(0, MVT::i16), // Segment.
31036 Zero,
31037 Chain
31038 };
31039 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31040 MVT::Other, Ops);
31041 return SDValue(Res, 1);
31042}
31043 
31044 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
31045                                  SelectionDAG &DAG) {
31046 SDLoc dl(Op);
31047 AtomicOrdering FenceOrdering =
31048 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
31049 SyncScope::ID FenceSSID =
31050 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
31051
31052 // The only fence that needs an instruction is a sequentially-consistent
31053 // cross-thread fence.
31054 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
31055 FenceSSID == SyncScope::System) {
31056 if (Subtarget.hasMFence())
31057 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
31058
31059 SDValue Chain = Op.getOperand(0);
31060 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
31061 }
31062
31063 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31064 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31065}
31066 
31067 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
31068                              SelectionDAG &DAG) {
31069 MVT T = Op.getSimpleValueType();
31070 SDLoc DL(Op);
31071 unsigned Reg = 0;
31072 unsigned size = 0;
31073 switch(T.SimpleTy) {
31074 default: llvm_unreachable("Invalid value type!");
31075 case MVT::i8: Reg = X86::AL; size = 1; break;
31076 case MVT::i16: Reg = X86::AX; size = 2; break;
31077 case MVT::i32: Reg = X86::EAX; size = 4; break;
31078 case MVT::i64:
31079 assert(Subtarget.is64Bit() && "Node not type legal!");
31080 Reg = X86::RAX; size = 8;
31081 break;
31082 }
31083 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31084 Op.getOperand(2), SDValue());
31085 SDValue Ops[] = { cpIn.getValue(0),
31086 Op.getOperand(1),
31087 Op.getOperand(3),
31088 DAG.getTargetConstant(size, DL, MVT::i8),
31089 cpIn.getValue(1) };
31090 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31091   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31092   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
31093                                            Ops, T, MMO);
31094
31095 SDValue cpOut =
31096 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
31097 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31098 MVT::i32, cpOut.getValue(2));
31099 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31100
31101 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31102 cpOut, Success, EFLAGS.getValue(1));
31103}
31104
31105 // Create MOVMSKB, taking into account whether we need to split for AVX1.
31106 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
31107                            const X86Subtarget &Subtarget) {
31108 MVT InVT = V.getSimpleValueType();
31109
31110 if (InVT == MVT::v64i8) {
31111 SDValue Lo, Hi;
31112 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31113 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31114 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31115 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31116 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31117 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31118 DAG.getConstant(32, DL, MVT::i8));
31119 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
31120 }
31121 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31122 SDValue Lo, Hi;
31123 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31124 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31125 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31126 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31127 DAG.getConstant(16, DL, MVT::i8));
31128 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31129 }
31130
31131 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31132}
31133
31134static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31135 SelectionDAG &DAG) {
31136 SDValue Src = Op.getOperand(0);
31137 MVT SrcVT = Src.getSimpleValueType();
31138 MVT DstVT = Op.getSimpleValueType();
31139
31140 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31141 // half to v32i1 and concatenating the result.
31142 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31143 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31144 assert(Subtarget.hasBWI() && "Expected BWI target");
31145 SDLoc dl(Op);
31146 SDValue Lo, Hi;
31147 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31148 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31149 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31150 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31151 }
31152
31153 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31154 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31155 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31156 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31157 SDLoc DL(Op);
31158 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31159 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31160 return DAG.getZExtOrTrunc(V, DL, DstVT);
31161 }
31162
31163 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31164 SrcVT == MVT::i64) && "Unexpected VT!");
31165
31166 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31167 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31168 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31169 // This conversion needs to be expanded.
31170 return SDValue();
31171
31172 SDLoc dl(Op);
31173 if (SrcVT.isVector()) {
31174     // Widen the input vector in the case of MVT::v2i32.
31175     // Example: from MVT::v2i32 to MVT::v4i32.
31176     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31177                                  SrcVT.getVectorNumElements() * 2);
31178 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31179 DAG.getUNDEF(SrcVT));
31180 } else {
31181 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31182 "Unexpected source type in LowerBITCAST");
31183 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31184 }
31185
31186 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31187 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31188
31189 if (DstVT == MVT::x86mmx)
31190 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31191
31192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31193 DAG.getIntPtrConstant(0, dl));
31194}
31195
31196/// Compute the horizontal sum of bytes in V for the elements of VT.
31197///
31198/// Requires V to be a byte vector and VT to be an integer vector type with
31199/// wider elements than V's type. The width of the elements of VT determines
31200/// how many bytes of V are summed horizontally to produce each element of the
31201/// result.
31202 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31203                                       const X86Subtarget &Subtarget,
31204 SelectionDAG &DAG) {
31205 SDLoc DL(V);
31206 MVT ByteVecVT = V.getSimpleValueType();
31207 MVT EltVT = VT.getVectorElementType();
31208 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31209 "Expected value to have byte element type.");
31210 assert(EltVT != MVT::i8 &&
31211 "Horizontal byte sum only makes sense for wider elements!");
31212 unsigned VecSize = VT.getSizeInBits();
31213 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31214
31215   // The PSADBW instruction horizontally adds all bytes and leaves the result in
31216   // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
31217 if (EltVT == MVT::i64) {
31218 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31219 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31220 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31221 return DAG.getBitcast(VT, V);
31222 }
31223
31224 if (EltVT == MVT::i32) {
31225 // We unpack the low half and high half into i32s interleaved with zeros so
31226 // that we can use PSADBW to horizontally sum them. The most useful part of
31227 // this is that it lines up the results of two PSADBW instructions to be
31228 // two v2i64 vectors which concatenated are the 4 population counts. We can
31229 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
31230 SDValue Zeros = DAG.getConstant(0, DL, VT);
31231 SDValue V32 = DAG.getBitcast(VT, V);
31232 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31233 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31234
31235 // Do the horizontal sums into two v2i64s.
31236 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31237 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31238 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31239 DAG.getBitcast(ByteVecVT, Low), Zeros);
31240 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31241 DAG.getBitcast(ByteVecVT, High), Zeros);
31242
31243 // Merge them together.
31244 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31245 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31246 DAG.getBitcast(ShortVecVT, Low),
31247 DAG.getBitcast(ShortVecVT, High));
31248
31249 return DAG.getBitcast(VT, V);
31250 }
31251
31252 // The only element type left is i16.
31253 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31254
31255 // To obtain pop count for each i16 element starting from the pop count for
31256 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31257   // right by 8. It is important to shift as i16s as an i8 vector shift isn't
31258 // directly supported.
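  // i.e. each i16 lane ends up holding popcnt(lo byte) + popcnt(hi byte): the
  // left shift adds the low-byte count into the high byte, and the final i16
  // right shift moves that sum back down into place.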
31259 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31260 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31261 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31262 DAG.getBitcast(ByteVecVT, V));
31263 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31264}
31265 
31266 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31267                                         const X86Subtarget &Subtarget,
31268 SelectionDAG &DAG) {
31269 MVT VT = Op.getSimpleValueType();
31270 MVT EltVT = VT.getVectorElementType();
31271 int NumElts = VT.getVectorNumElements();
31272 (void)EltVT;
31273 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31274
31275 // Implement a lookup table in register by using an algorithm based on:
31276 // http://wm.ite.pl/articles/sse-popcount.html
31277 //
31278 // The general idea is that every lower byte nibble in the input vector is an
31279   // index into an in-register pre-computed pop count table. We then split up the
31280 // input vector in two new ones: (1) a vector with only the shifted-right
31281 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31282 // masked out higher ones) for each byte. PSHUFB is used separately with both
31283   // to index the in-register table. Next, both are added and the result is an
31284   // i8 vector where each element contains the pop count for the input byte.
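  // For example, the byte 0xB4 (0b10110100) splits into high nibble 0xB and low
  // nibble 0x4; LUT[0xB] = 3 and LUT[0x4] = 1, summing to the expected 4 set bits.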
31285 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31286 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31287 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31288 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
31289 
31290   SmallVector<SDValue, 64> LUTVec;
31291   for (int i = 0; i < NumElts; ++i)
31292 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31293 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31294 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31295
31296 // High nibbles
31297 SDValue FourV = DAG.getConstant(4, DL, VT);
31298 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31299
31300 // Low nibbles
31301 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31302
31303   // The input vector is used as the shuffle mask that indexes elements into the
31304 // LUT. After counting low and high nibbles, add the vector to obtain the
31305 // final pop count per i8 element.
31306 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31307 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31308 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31309}
31310
31311// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31312// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31313 static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31314                                 const X86Subtarget &Subtarget,
31315 SelectionDAG &DAG) {
31316 MVT VT = Op.getSimpleValueType();
31317 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31318 "Unknown CTPOP type to handle");
31319 SDValue Op0 = Op.getOperand(0);
31320
31321 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31322 if (Subtarget.hasVPOPCNTDQ()) {
31323 unsigned NumElems = VT.getVectorNumElements();
31324 assert((VT.getVectorElementType() == MVT::i8 ||
31325 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31326 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31327 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31328 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31329 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31330 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31331 }
31332 }
31333
31334 // Decompose 256-bit ops into smaller 128-bit ops.
31335 if (VT.is256BitVector() && !Subtarget.hasInt256())
31336 return splitVectorIntUnary(Op, DAG, DL);
31337
31338 // Decompose 512-bit ops into smaller 256-bit ops.
31339 if (VT.is512BitVector() && !Subtarget.hasBWI())
31340 return splitVectorIntUnary(Op, DAG, DL);
31341
31342 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31343 if (VT.getScalarType() != MVT::i8) {
31344 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31345 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31346 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31347 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31348 }
31349
31350 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31351 if (!Subtarget.hasSSSE3())
31352 return SDValue();
31353
31354 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31355}
31356
31357static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31358 SelectionDAG &DAG) {
31359 MVT VT = N.getSimpleValueType();
31360 SDValue Op = N.getOperand(0);
31361 SDLoc DL(N);
31362
31363 if (VT.isScalarInteger()) {
31364 // Compute the lower/upper bounds of the active bits of the value,
31365 // allowing us to shift the active bits down if necessary to fit into the
31366 // special cases below.
31367 KnownBits Known = DAG.computeKnownBits(Op);
31368 unsigned LZ = Known.countMinLeadingZeros();
31369 unsigned TZ = Known.countMinTrailingZeros();
31370 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31371 unsigned ActiveBits = Known.getBitWidth() - LZ;
31372 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31373
31374 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
31375 if (ShiftedActiveBits <= 2) {
31376 if (ActiveBits > 2)
31377 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31378 DAG.getShiftAmountConstant(TZ, VT, DL));
31379 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31380 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31381 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31382 DAG.getShiftAmountConstant(1, VT, DL)));
31383 return DAG.getZExtOrTrunc(Op, DL, VT);
31384 }
31385
31386 // i3 CTPOP - perform LUT into i32 integer.
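    // The constant 0b1110100110010100 packs the 2-bit pop counts of 0..7, so the
    // count of x is (LUT >> (2 * x)) & 0x3.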
31387 if (ShiftedActiveBits <= 3) {
31388 if (ActiveBits > 3)
31389 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31390 DAG.getShiftAmountConstant(TZ, VT, DL));
31391 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31392 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31393 DAG.getShiftAmountConstant(1, VT, DL));
31394 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31395 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31396 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31397 DAG.getConstant(0x3, DL, MVT::i32));
31398 return DAG.getZExtOrTrunc(Op, DL, VT);
31399 }
31400
31401 // i4 CTPOP - perform LUT into i64 integer.
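    // 0x4332322132212110 packs the 4-bit pop counts of 0..15; the multiply by 4
    // below forms the shift amount 4 * x used to index into it.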
31402 if (ShiftedActiveBits <= 4 &&
31403 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31404 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31405 if (ActiveBits > 4)
31406 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31407 DAG.getShiftAmountConstant(TZ, VT, DL));
31408 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31409 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31410 DAG.getConstant(4, DL, MVT::i32));
31411 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31412 DAG.getShiftAmountOperand(MVT::i64, Op));
31413 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31414 DAG.getConstant(0x7, DL, MVT::i64));
31415 return DAG.getZExtOrTrunc(Op, DL, VT);
31416 }
31417
31418 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
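    // Multiplying by 0x08040201 places copies of x at bit offsets 0/9/18/27;
    // after the shift and 0x11111111 mask each nibble holds one distinct bit of
    // x, and the second multiply sums all eight nibbles into the top nibble.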
31419 if (ShiftedActiveBits <= 8) {
31420 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31421 if (ActiveBits > 8)
31422 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31423 DAG.getShiftAmountConstant(TZ, VT, DL));
31424 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31425 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31426 DAG.getConstant(0x08040201U, DL, MVT::i32));
31427 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31428 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31429 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31430 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31431 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31432 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31433 return DAG.getZExtOrTrunc(Op, DL, VT);
31434 }
31435
31436 return SDValue(); // fallback to generic expansion.
31437 }
31438
31439 assert(VT.isVector() &&
31440 "We only do custom lowering for vector population count.");
31441 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31442}
31443 
31444 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31445   MVT VT = Op.getSimpleValueType();
31446 SDValue In = Op.getOperand(0);
31447 SDLoc DL(Op);
31448
31449   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31450 // perform the BITREVERSE.
31451 if (!VT.isVector()) {
31452 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31453 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31454 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31455 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31456 DAG.getIntPtrConstant(0, DL));
31457 }
31458
31459 int NumElts = VT.getVectorNumElements();
31460 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31461
31462 // Decompose 256-bit ops into smaller 128-bit ops.
31463 if (VT.is256BitVector())
31464 return splitVectorIntUnary(Op, DAG, DL);
31465
31466 assert(VT.is128BitVector() &&
31467 "Only 128-bit vector bitreverse lowering supported.");
31468
31469 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31470 // perform the BSWAP in the shuffle.
31471   // It's best to shuffle using the second operand as this will implicitly allow
31472 // memory folding for multiple vectors.
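  // In each selector byte the (2 << 5) op field asks VPPERM for the bit-reversed
  // source byte, so the byte swap and per-byte bit reverse happen in one shuffle.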
31473 SmallVector<SDValue, 16> MaskElts;
31474 for (int i = 0; i != NumElts; ++i) {
31475 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31476 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31477 int PermuteByte = SourceByte | (2 << 5);
31478 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31479 }
31480 }
31481
31482 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31483 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31484 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31485 Res, Mask);
31486 return DAG.getBitcast(VT, Res);
31487}
31488 
31489 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31490                                SelectionDAG &DAG) {
31491 MVT VT = Op.getSimpleValueType();
31492
31493 if (Subtarget.hasXOP() && !VT.is512BitVector())
31494 return LowerBITREVERSE_XOP(Op, DAG);
31495
31496 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31497
31498 SDValue In = Op.getOperand(0);
31499 SDLoc DL(Op);
31500
31501 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31502 if (VT.is512BitVector() && !Subtarget.hasBWI())
31503 return splitVectorIntUnary(Op, DAG, DL);
31504
31505 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31506 if (VT.is256BitVector() && !Subtarget.hasInt256())
31507 return splitVectorIntUnary(Op, DAG, DL);
31508
31509 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31510 if (!VT.isVector()) {
31511 assert(
31512 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31513 "Only tested for i8/i16/i32/i64");
31514 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31515 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31516 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31517 DAG.getBitcast(MVT::v16i8, Res));
31518 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31519 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31520 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31521 }
31522
31523 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31524
31525 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31526 if (VT.getScalarType() != MVT::i8) {
31527 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31528 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31529 Res = DAG.getBitcast(ByteVT, Res);
31530 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31531 return DAG.getBitcast(VT, Res);
31532 }
31533 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31534 "Only byte vector BITREVERSE supported");
31535
31536 unsigned NumElts = VT.getVectorNumElements();
31537
31538 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31539   if (Subtarget.hasGFNI()) {
31540     SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
31541     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31542 DAG.getTargetConstant(0, DL, MVT::i8));
31543 }
31544
31545 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31546 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31547 // 0-15 value (moved to the other nibble).
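  // For example, 0x1E (0b00011110) reverses to 0x78: HiLUT[0x1] = 0x08,
  // LoLUT[0xE] = 0x70, and OR'ing them gives 0b01111000.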
31548 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31549 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31550 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31551
31552 const int LoLUT[16] = {
31553 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31554 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31555 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31556 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31557 const int HiLUT[16] = {
31558 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31559 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31560 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31561 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31562
31563 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31564 for (unsigned i = 0; i < NumElts; ++i) {
31565 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31566 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31567 }
31568
31569 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31570 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31571 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31572 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31573 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31574}
31575
31576static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31577 SelectionDAG &DAG) {
31578 SDLoc DL(Op);
31579 SDValue X = Op.getOperand(0);
31580 MVT VT = Op.getSimpleValueType();
31581
31582 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31583   if (VT == MVT::i8 ||
31584       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31585     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31586 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31587 DAG.getConstant(0, DL, MVT::i8));
31588 // Copy the inverse of the parity flag into a register with setcc.
31589 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31590 // Extend to the original type.
31591 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31592 }
31593
31594 // If we have POPCNT, use the default expansion.
31595 if (Subtarget.hasPOPCNT())
31596 return SDValue();
31597
31598 if (VT == MVT::i64) {
31600     // Xor the high and low 32-bits together using a 32-bit operation.
31600 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31601 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31602 DAG.getConstant(32, DL, MVT::i8)));
31603 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31604 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31605 }
31606
31607 if (VT != MVT::i16) {
31608 // Xor the high and low 16-bits together using a 32-bit operation.
31609 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31610 DAG.getConstant(16, DL, MVT::i8));
31611 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31612 } else {
31613 // If the input is 16-bits, we need to extend to use an i32 shift below.
31614 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31615 }
31616
31617   // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31618 // This should allow an h-reg to be used to save a shift.
31619 SDValue Hi = DAG.getNode(
31620 ISD::TRUNCATE, DL, MVT::i8,
31621 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31622 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31623 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31624 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31625
31626 // Copy the inverse of the parity flag into a register with setcc.
31627 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31628 // Extend to the original type.
31629 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31630}
31631 
31632 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31633                                         const X86Subtarget &Subtarget) {
31634 unsigned NewOpc = 0;
31635   switch (N->getOpcode()) {
31636   case ISD::ATOMIC_LOAD_ADD:
31637     NewOpc = X86ISD::LADD;
31638     break;
31639   case ISD::ATOMIC_LOAD_SUB:
31640     NewOpc = X86ISD::LSUB;
31641     break;
31642   case ISD::ATOMIC_LOAD_OR:
31643     NewOpc = X86ISD::LOR;
31644     break;
31645   case ISD::ATOMIC_LOAD_XOR:
31646     NewOpc = X86ISD::LXOR;
31647     break;
31648   case ISD::ATOMIC_LOAD_AND:
31649     NewOpc = X86ISD::LAND;
31650     break;
31651 default:
31652 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31653 }
31654
31655 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31656
31657 return DAG.getMemIntrinsicNode(
31658 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31659 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31660 /*MemVT=*/N->getSimpleValueType(0), MMO);
31661}
31662
31663/// Lower atomic_load_ops into LOCK-prefixed operations.
31664 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31665                                 const X86Subtarget &Subtarget) {
31666 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31667 SDValue Chain = N->getOperand(0);
31668 SDValue LHS = N->getOperand(1);
31669 SDValue RHS = N->getOperand(2);
31670 unsigned Opc = N->getOpcode();
31671 MVT VT = N->getSimpleValueType(0);
31672 SDLoc DL(N);
31673
31674 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31675 // can only be lowered when the result is unused. They should have already
31676 // been transformed into a cmpxchg loop in AtomicExpand.
31677 if (N->hasAnyUseOfValue(0)) {
31678 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31679 // select LXADD if LOCK_SUB can't be selected.
31680 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31681 // can use LXADD as opposed to cmpxchg.
31682     if (Opc == ISD::ATOMIC_LOAD_SUB ||
31683         (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
31684       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31685                            DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31686
31687     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31688            "Used AtomicRMW ops other than Add should have been expanded!");
31689 return N;
31690 }
31691
31692   // Specialized lowering for the canonical form of an idempotent atomicrmw.
31693   // The core idea here is that since the memory location isn't actually
31694   // changing, all we need is a lowering for the *ordering* impacts of the
31695   // atomicrmw. As such, we can choose a different operation and memory
31696   // location to minimize impact on other code.
31697   // The above holds unless the node is marked volatile, in which
31698   // case it must be preserved according to the LangRef.
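  // For example, an unused-result "atomicrmw or ptr %p, i32 0 seq_cst" only
  // needs its ordering effect, so below we emit either a locked no-op against
  // the stack or a compiler-only MEMBARRIER instead of touching %p.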
31699 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31700 // On X86, the only ordering which actually requires an instruction is
31701 // seq_cst which isn't SingleThread, everything just needs to be preserved
31702     // during codegen and then dropped. Note that we expect (but don't assume)
31703     // that orderings other than seq_cst and acq_rel have been canonicalized to
31704     // a store or load.
31705     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31706         AN->getSyncScopeID() == SyncScope::System) {
31707       // Prefer a locked operation against a stack location to minimize cache
31708 // traffic. This assumes that stack locations are very likely to be
31709 // accessed only by the owning thread.
31710 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31711 assert(!N->hasAnyUseOfValue(0));
31712 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31713 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31714 DAG.getUNDEF(VT), NewChain);
31715 }
31716 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31717 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31718 assert(!N->hasAnyUseOfValue(0));
31719 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31720 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31721 DAG.getUNDEF(VT), NewChain);
31722 }
31723
31724 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31725 // RAUW the chain, but don't worry about the result, as it's unused.
31726 assert(!N->hasAnyUseOfValue(0));
31727 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31728 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31729 DAG.getUNDEF(VT), LockOp.getValue(1));
31730}
31731
31733 const X86Subtarget &Subtarget) {
31734 auto *Node = cast<AtomicSDNode>(Op.getNode());
31735 SDLoc dl(Node);
31736 EVT VT = Node->getMemoryVT();
31737
31738 bool IsSeqCst =
31739 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31740 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31741
31742 // If this store is not sequentially consistent and the type is legal
31743 // we can just keep it.
31744 if (!IsSeqCst && IsTypeLegal)
31745 return Op;
31746
31747   if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
31748       !DAG.getMachineFunction().getFunction().hasFnAttribute(
31749           Attribute::NoImplicitFloat)) {
31750 SDValue Chain;
31751 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
31752 // vector store.
31753 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
31754 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
31755 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
31756 Node->getMemOperand());
31757 }
31758
31759 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31760 // is enabled.
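    // These lowerings rely on an aligned 8-byte SSE or x87 load/store being
    // performed atomically by the hardware, which is what lets a plain vector
    // or FP store stand in for an i64 atomic store on 32-bit targets.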
31761 if (VT == MVT::i64) {
31762 if (Subtarget.hasSSE1()) {
31763 SDValue SclToVec =
31764 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31765 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31766 SclToVec = DAG.getBitcast(StVT, SclToVec);
31767 SDVTList Tys = DAG.getVTList(MVT::Other);
31768 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31769 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31770 MVT::i64, Node->getMemOperand());
31771 } else if (Subtarget.hasX87()) {
31772 // First load this into an 80-bit X87 register using a stack temporary.
31773 // This will put the whole integer into the significand.
31774 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31775 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31776         MachinePointerInfo MPI =
31777             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31778         Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31779                              MPI, MaybeAlign(), MachineMemOperand::MOStore);
31780         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31781         SDValue LdOps[] = {Chain, StackPtr};
31782         SDValue Value = DAG.getMemIntrinsicNode(
31783             X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31784 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31785 Chain = Value.getValue(1);
31786
31787 // Now use an FIST to do the atomic store.
31788 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31789 Chain =
31790 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31791 StoreOps, MVT::i64, Node->getMemOperand());
31792 }
31793 }
31794
31795 if (Chain) {
31796 // If this is a sequentially consistent store, also emit an appropriate
31797 // barrier.
31798 if (IsSeqCst)
31799 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31800
31801 return Chain;
31802 }
31803 }
31804
31805 // Convert seq_cst store -> xchg
31806 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31807 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31808 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31809 Node->getOperand(0), Node->getOperand(2),
31810 Node->getOperand(1), Node->getMemOperand());
31811 return Swap.getValue(1);
31812}
31813
31815 SDNode *N = Op.getNode();
31816 MVT VT = N->getSimpleValueType(0);
31817 unsigned Opc = Op.getOpcode();
31818
31819 // Let legalize expand this if it isn't a legal type yet.
31820 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31821 return SDValue();
31822
31823 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31824 SDLoc DL(N);
31825
31826 // Set the carry flag.
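  // Adding all-ones (-1) to the incoming carry value sets EFLAGS.CF exactly
  // when the carry operand is nonzero, which lets the ADC/SBB below consume
  // it as a real hardware carry input.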
31827 SDValue Carry = Op.getOperand(2);
31828 EVT CarryVT = Carry.getValueType();
31829 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31830 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31831
31832 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31833 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31834 Op.getOperand(0), Op.getOperand(1),
31835 Carry.getValue(1));
31836
31837 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31838 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31839 Sum.getValue(1), DL, DAG);
31840 if (N->getValueType(1) == MVT::i1)
31841 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31842
31843 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31844}
31845
31846static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31847 SelectionDAG &DAG) {
31848 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31849
31850 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31851 // which returns the values as { float, float } (in XMM0) or
31852 // { double, double } (which is returned in XMM0, XMM1).
31853 SDLoc dl(Op);
31854 SDValue Arg = Op.getOperand(0);
31855 EVT ArgVT = Arg.getValueType();
31856 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31857
31858   TargetLowering::ArgListTy Args;
31859   TargetLowering::ArgListEntry Entry;
31860
31861 Entry.Node = Arg;
31862 Entry.Ty = ArgTy;
31863 Entry.IsSExt = false;
31864 Entry.IsZExt = false;
31865 Args.push_back(Entry);
31866
31867 bool isF64 = ArgVT == MVT::f64;
31868 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31869 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31870 // the results are returned via SRet in memory.
31871 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31872 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31873 const char *LibcallName = TLI.getLibcallName(LC);
31874 SDValue Callee =
31875 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31876
31877 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31878 : (Type *)FixedVectorType::get(ArgTy, 4);
31879
31880   TargetLowering::CallLoweringInfo CLI(DAG);
31881   CLI.setDebugLoc(dl)
31882 .setChain(DAG.getEntryNode())
31883 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31884
31885 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31886
31887 if (isF64)
31888 // Returned in xmm0 and xmm1.
31889 return CallResult.first;
31890
31891   // Returned in bits 0:31 and 32:63 of xmm0.
31892 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31893 CallResult.first, DAG.getIntPtrConstant(0, dl));
31894 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31895 CallResult.first, DAG.getIntPtrConstant(1, dl));
31896 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31897 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31898}
31899
31900/// Widen a vector input to a vector of NVT. The
31901/// input vector must have the same element type as NVT.
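/// For example, widening a v2i32 value to v4i32 appends two undef (or zero,
/// if FillWithZeroes) elements, either by rebuilding a constant BUILD_VECTOR
/// or by inserting the input into an undef/zero wide vector at index 0.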
31902 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31903                             bool FillWithZeroes = false) {
31904 // Check if InOp already has the right width.
31905 MVT InVT = InOp.getSimpleValueType();
31906 if (InVT == NVT)
31907 return InOp;
31908
31909 if (InOp.isUndef())
31910 return DAG.getUNDEF(NVT);
31911
31912   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31913          "input and widen element type must match");
31914
31915 unsigned InNumElts = InVT.getVectorNumElements();
31916 unsigned WidenNumElts = NVT.getVectorNumElements();
31917 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31918 "Unexpected request for vector widening");
31919
31920 SDLoc dl(InOp);
31921 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31922 InOp.getNumOperands() == 2) {
31923 SDValue N1 = InOp.getOperand(1);
31924 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31925 N1.isUndef()) {
31926 InOp = InOp.getOperand(0);
31927 InVT = InOp.getSimpleValueType();
31928 InNumElts = InVT.getVectorNumElements();
31929 }
31930 }
31931   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31932       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31933     SmallVector<SDValue, 16> Ops;
31934     for (unsigned i = 0; i < InNumElts; ++i)
31935 Ops.push_back(InOp.getOperand(i));
31936
31937 EVT EltVT = InOp.getOperand(0).getValueType();
31938
31939 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31940 DAG.getUNDEF(EltVT);
31941 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31942 Ops.push_back(FillVal);
31943 return DAG.getBuildVector(NVT, dl, Ops);
31944 }
31945 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31946 DAG.getUNDEF(NVT);
31947 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31948 InOp, DAG.getIntPtrConstant(0, dl));
31949}
31950
31951 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31952                              SelectionDAG &DAG) {
31953 assert(Subtarget.hasAVX512() &&
31954 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31955
31956 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31957 SDValue Src = N->getValue();
31958 MVT VT = Src.getSimpleValueType();
31959 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31960 SDLoc dl(Op);
31961
31962 SDValue Scale = N->getScale();
31963 SDValue Index = N->getIndex();
31964 SDValue Mask = N->getMask();
31965 SDValue Chain = N->getChain();
31966 SDValue BasePtr = N->getBasePtr();
31967
31968 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31969 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31970 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31971 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31972 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31973 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31974 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31975 SDVTList VTs = DAG.getVTList(MVT::Other);
31976 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31977 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31978 N->getMemoryVT(), N->getMemOperand());
31979 }
31980 return SDValue();
31981 }
31982
31983 MVT IndexVT = Index.getSimpleValueType();
31984
31985 // If the index is v2i32, we're being called by type legalization and we
31986 // should just let the default handling take care of it.
31987 if (IndexVT == MVT::v2i32)
31988 return SDValue();
31989
31990   // If we don't have VLX and neither the data nor the index is 512 bits, we
31991   // need to widen until one is.
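  // E.g. with AVX-512F but no VLX, a v4i32 scatter indexed by v4i64 is widened
  // by a factor of 2: the data becomes v8i32 and the index v8i64 (512 bits),
  // and the mask is widened with zeros so the new lanes stay inactive.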
31992 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31993 !Index.getSimpleValueType().is512BitVector()) {
31994 // Determine how much we need to widen by to get a 512-bit type.
31995 unsigned Factor = std::min(512/VT.getSizeInBits(),
31996 512/IndexVT.getSizeInBits());
31997 unsigned NumElts = VT.getVectorNumElements() * Factor;
31998
31999 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32000 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32001 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32002
32003 Src = ExtendToType(Src, VT, DAG);
32004 Index = ExtendToType(Index, IndexVT, DAG);
32005 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32006 }
32007
32008 SDVTList VTs = DAG.getVTList(MVT::Other);
32009 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32010 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32011 N->getMemoryVT(), N->getMemOperand());
32012}
32013
32014static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32015 SelectionDAG &DAG) {
32016
32017 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32018 MVT VT = Op.getSimpleValueType();
32019 MVT ScalarVT = VT.getScalarType();
32020 SDValue Mask = N->getMask();
32021 MVT MaskVT = Mask.getSimpleValueType();
32022 SDValue PassThru = N->getPassThru();
32023 SDLoc dl(Op);
32024
32025 // Handle AVX masked loads which don't support passthru other than 0.
32026 if (MaskVT.getVectorElementType() != MVT::i1) {
32027 // We also allow undef in the isel pattern.
32028 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32029 return Op;
32030
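    // Re-issue the load with an all-zeros passthru (the only passthru these
    // AVX masked loads support besides undef) and then blend the real passthru
    // back in with a VSELECT.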
32031 SDValue NewLoad = DAG.getMaskedLoad(
32032 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32033 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32034 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32035 N->isExpandingLoad());
32036 // Emit a blend.
32037 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32038 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32039 }
32040
32041 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32042 "Expanding masked load is supported on AVX-512 target only!");
32043
32044 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32045 "Expanding masked load is supported for 32 and 64-bit types only!");
32046
32047 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32048 "Cannot lower masked load op.");
32049
32050 assert((ScalarVT.getSizeInBits() >= 32 ||
32051 (Subtarget.hasBWI() &&
32052 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32053 "Unsupported masked load op.");
32054
32055 // This operation is legal for targets with VLX, but without
32056   // VLX the vector should be widened to 512 bits.
32057 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32058 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32059 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32060
32061 // Mask element has to be i1.
32062 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32063 "Unexpected mask type");
32064
32065 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32066
32067 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32068 SDValue NewLoad = DAG.getMaskedLoad(
32069 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32070 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32071 N->getExtensionType(), N->isExpandingLoad());
32072
32073 SDValue Extract =
32074 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32075 DAG.getIntPtrConstant(0, dl));
32076 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32077 return DAG.getMergeValues(RetOps, dl);
32078}
32079
32080static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32081 SelectionDAG &DAG) {
32082 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32083 SDValue DataToStore = N->getValue();
32084 MVT VT = DataToStore.getSimpleValueType();
32085 MVT ScalarVT = VT.getScalarType();
32086 SDValue Mask = N->getMask();
32087 SDLoc dl(Op);
32088
32089 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32090          "Compressing masked store is supported on AVX-512 target only!");
32091
32092 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32093          "Compressing masked store is supported for 32 and 64-bit types only!");
32094
32095 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32096 "Cannot lower masked store op.");
32097
32098 assert((ScalarVT.getSizeInBits() >= 32 ||
32099 (Subtarget.hasBWI() &&
32100 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32101 "Unsupported masked store op.");
32102
32103 // This operation is legal for targets with VLX, but without
32104   // VLX the vector should be widened to 512 bits.
32105 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32106 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32107
32108 // Mask element has to be i1.
32109 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32110 "Unexpected mask type");
32111
32112 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32113
32114 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32115 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32116 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32117 N->getOffset(), Mask, N->getMemoryVT(),
32118 N->getMemOperand(), N->getAddressingMode(),
32119 N->isTruncatingStore(), N->isCompressingStore());
32120}
32121
32122static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32123 SelectionDAG &DAG) {
32124 assert(Subtarget.hasAVX2() &&
32125 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32126
32127 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32128 SDLoc dl(Op);
32129 MVT VT = Op.getSimpleValueType();
32130 SDValue Index = N->getIndex();
32131 SDValue Mask = N->getMask();
32132 SDValue PassThru = N->getPassThru();
32133 MVT IndexVT = Index.getSimpleValueType();
32134
32135 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32136
32137 // If the index is v2i32, we're being called by type legalization.
32138 if (IndexVT == MVT::v2i32)
32139 return SDValue();
32140
32141   // If we don't have VLX and neither the passthru nor the index is 512 bits,
32142   // we need to widen until one is.
32143 MVT OrigVT = VT;
32144 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32145 !IndexVT.is512BitVector()) {
32146 // Determine how much we need to widen by to get a 512-bit type.
32147 unsigned Factor = std::min(512/VT.getSizeInBits(),
32148 512/IndexVT.getSizeInBits());
32149
32150 unsigned NumElts = VT.getVectorNumElements() * Factor;
32151
32152 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32153 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32154 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32155
32156 PassThru = ExtendToType(PassThru, VT, DAG);
32157 Index = ExtendToType(Index, IndexVT, DAG);
32158 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32159 }
32160
32161 // Break dependency on the data register.
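  // A gather merges into its destination register, so an undef passthru would
  // leave a false dependency on whatever the register held before; feeding it
  // an explicit zero vector breaks that dependency.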
32162 if (PassThru.isUndef())
32163 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32164
32165 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32166 N->getScale() };
32167 SDValue NewGather = DAG.getMemIntrinsicNode(
32168 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32169 N->getMemOperand());
32170 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32171 NewGather, DAG.getIntPtrConstant(0, dl));
32172 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32173}
32174
32175 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32176   SDLoc dl(Op);
32177 SDValue Src = Op.getOperand(0);
32178 MVT DstVT = Op.getSimpleValueType();
32179
32180 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32181 unsigned SrcAS = N->getSrcAddressSpace();
32182
32183 assert(SrcAS != N->getDestAddressSpace() &&
32184 "addrspacecast must be between different address spaces");
32185
32186 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32187 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32188 } else if (DstVT == MVT::i64) {
32189 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32190 } else if (DstVT == MVT::i32) {
32191 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32192 } else {
32193 report_fatal_error("Bad address space in addrspacecast");
32194 }
32195 return Op;
32196}
32197
32198SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32199 SelectionDAG &DAG) const {
32200 // TODO: Eventually, the lowering of these nodes should be informed by or
32201 // deferred to the GC strategy for the function in which they appear. For
32202 // now, however, they must be lowered to something. Since they are logically
32203 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32204 // require special handling for these nodes), lower them as literal NOOPs for
32205 // the time being.
32206   SmallVector<SDValue, 2> Ops;
32207   Ops.push_back(Op.getOperand(0));
32208 if (Op->getGluedNode())
32209 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32210
32211 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32212 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32213}
32214
32215// Custom split CVTPS2PH with wide types.
32216 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32217   SDLoc dl(Op);
32218 EVT VT = Op.getValueType();
32219 SDValue Lo, Hi;
32220 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32221 EVT LoVT, HiVT;
32222 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32223 SDValue RC = Op.getOperand(1);
32224 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32225 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32226 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32227}
32228
32229 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32230                              SelectionDAG &DAG) {
32231 unsigned IsData = Op.getConstantOperandVal(4);
32232
32233 // We don't support non-data prefetch without PREFETCHI.
32234 // Just preserve the chain.
32235 if (!IsData && !Subtarget.hasPREFETCHI())
32236 return Op.getOperand(0);
32237
32238 return Op;
32239}
32240
32241 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32242                                      unsigned OpNo) {
32243 const APInt Operand(32, OpNo);
32244 std::string OpNoStr = llvm::toString(Operand, 10, false);
32245 std::string Str(" $");
32246
32247 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32248 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32249
32250 auto I = StringRef::npos;
32251 for (auto &AsmStr : AsmStrs) {
32252     // Match the OpNo string exactly so we don't match a sub-string,
32253     // e.g. "$12" contains "$1".
32254 if (AsmStr.ends_with(OpNoStr1))
32255 I = AsmStr.size() - OpNoStr1.size();
32256
32257 // Get the index of operand in AsmStr.
32258 if (I == StringRef::npos)
32259 I = AsmStr.find(OpNoStr1 + ",");
32260 if (I == StringRef::npos)
32261 I = AsmStr.find(OpNoStr2);
32262
32263 if (I == StringRef::npos)
32264 continue;
32265
32266 assert(I > 0 && "Unexpected inline asm string!");
32267     // Remove the operand string and label (if it exists).
32268 // For example:
32269 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32270 // ==>
32271 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32272 // ==>
32273 // "call dword ptr "
32274 auto TmpStr = AsmStr.substr(0, I);
32275 I = TmpStr.rfind(':');
32276 if (I != StringRef::npos)
32277 TmpStr = TmpStr.substr(I + 1);
32278 return TmpStr.take_while(llvm::isAlpha);
32279 }
32280
32281 return StringRef();
32282}
32283
32284 bool X86TargetLowering::isInlineAsmTargetBranch(
32285     const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32286 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32287 // changed from indirect TargetLowering::C_Memory to direct
32288 // TargetLowering::C_Address.
32289 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32290 // location.
32291 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32292 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32293}
32294
32295 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
32296                                       SDValue Mask) {
32297 EVT Ty = MVT::i8;
32298 auto V = DAG.getBitcast(MVT::i1, Mask);
32299 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
32300 auto Zero = DAG.getConstant(0, DL, Ty);
32301 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
32302 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
32303 return SDValue(CmpZero.getNode(), 1);
32304}
32305
32306 SDValue X86TargetLowering::visitMaskedLoad(
32307     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
32308 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
32309 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
32310 // ->
32311 // _, flags = SUB 0, mask
32312 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
32313 // bit_cast_to_vector<res>
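  // In effect, the single-element masked load becomes a scalar conditional
  // load: "SUB 0, mask" sets ZF iff the mask bit is clear, so COND_NE guards
  // the load on the mask being set.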
32314 EVT VTy = PassThru.getValueType();
32315 EVT Ty = VTy.getVectorElementType();
32316 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
32317 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
32318 : DAG.getBitcast(Ty, PassThru);
32319 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
32320 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
32321 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
32322 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
32323 return DAG.getBitcast(VTy, NewLoad);
32324}
32325
32326 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
32327                                             SDValue Chain,
32328                                             MachineMemOperand *MMO, SDValue Ptr,
32329                                             SDValue Val, SDValue Mask) const {
32330 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
32331 // ->
32332 // _, flags = SUB 0, mask
32333 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
32334   EVT Ty = Val.getValueType().getVectorElementType();
32335   SDVTList Tys = DAG.getVTList(MVT::Other);
32336 auto ScalarVal = DAG.getBitcast(Ty, Val);
32337 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
32338 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
32339 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
32340 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
32341}
32342
32343/// Provide custom lowering hooks for some operations.
32344 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32345   switch (Op.getOpcode()) {
32346 // clang-format off
32347 default: llvm_unreachable("Should not custom lower this!");
32348   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32349   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
32350     return LowerCMP_SWAP(Op, Subtarget, DAG);
32351   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
32352   case ISD::ATOMIC_LOAD_ADD:
32353   case ISD::ATOMIC_LOAD_SUB:
32354   case ISD::ATOMIC_LOAD_OR:
32355   case ISD::ATOMIC_LOAD_XOR:
32356   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
32357 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32358 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32359 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32360 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32361 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32362 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32363 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32364 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32365 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32366 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32367 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32368 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32369 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32370 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32371 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32372 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32373 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32374 case ISD::SHL_PARTS:
32375 case ISD::SRA_PARTS:
32376 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32377 case ISD::FSHL:
32378 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32379   case ISD::STRICT_SINT_TO_FP:
32380   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
32381   case ISD::STRICT_UINT_TO_FP:
32382   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
32383 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32384 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32385 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32386 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32387   case ISD::ZERO_EXTEND_VECTOR_INREG:
32388   case ISD::SIGN_EXTEND_VECTOR_INREG:
32389     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32390   case ISD::FP_TO_SINT:
32391   case ISD::STRICT_FP_TO_SINT:
32392   case ISD::FP_TO_UINT:
32393   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
32394   case ISD::FP_TO_SINT_SAT:
32395   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
32396 case ISD::FP_EXTEND:
32397 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32398 case ISD::FP_ROUND:
32399 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32400 case ISD::FP16_TO_FP:
32401 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32402 case ISD::FP_TO_FP16:
32403 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32404 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32405 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32406 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32407 case ISD::FADD:
32408 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32409 case ISD::FROUND: return LowerFROUND(Op, DAG);
32410 case ISD::FABS:
32411 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32412 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32413 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32414 case ISD::LRINT:
32415 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32416 case ISD::SETCC:
32417 case ISD::STRICT_FSETCC:
32418 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32419 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32420 case ISD::SELECT: return LowerSELECT(Op, DAG);
32421 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32422 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32423 case ISD::VASTART: return LowerVASTART(Op, DAG);
32424 case ISD::VAARG: return LowerVAARG(Op, DAG);
32425 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32426 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32427   case ISD::INTRINSIC_VOID:
32428   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32429 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32430 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32431 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32432   case ISD::FRAME_TO_ARGS_OFFSET:
32433     return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32434 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32435 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32436 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32437 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32438   case ISD::EH_SJLJ_SETUP_DISPATCH:
32439     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32440   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
32441   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
32442   case ISD::GET_ROUNDING:       return LowerGET_ROUNDING(Op, DAG);
32443 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32444 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32445 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32446 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32447 case ISD::CTLZ:
32448 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32449 case ISD::CTTZ:
32450 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32451 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32452 case ISD::MULHS:
32453 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32454 case ISD::ROTL:
32455 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32456 case ISD::SRA:
32457 case ISD::SRL:
32458 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32459 case ISD::SADDO:
32460 case ISD::UADDO:
32461 case ISD::SSUBO:
32462 case ISD::USUBO: return LowerXALUO(Op, DAG);
32463 case ISD::SMULO:
32464 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32465 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32466 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32467 case ISD::SADDO_CARRY:
32468 case ISD::SSUBO_CARRY:
32469 case ISD::UADDO_CARRY:
32470 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32471 case ISD::ADD:
32472 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32473 case ISD::UADDSAT:
32474 case ISD::SADDSAT:
32475 case ISD::USUBSAT:
32476 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32477 case ISD::SMAX:
32478 case ISD::SMIN:
32479 case ISD::UMAX:
32480 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32481 case ISD::FMINIMUM:
32482 case ISD::FMAXIMUM:
32483 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32484 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32485 case ISD::ABDS:
32486 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32487 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32488 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32489 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32490 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32491 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32492 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32493   case ISD::GC_TRANSITION_START:
32494   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
32495 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32496 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32497 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32498 // clang-format on
32499 }
32500}
32501
32502/// Replace a node with an illegal result type with a new node built out of
32503/// custom code.
32504 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32505                                            SmallVectorImpl<SDValue> &Results,
32506                                            SelectionDAG &DAG) const {
32507 SDLoc dl(N);
32508 switch (N->getOpcode()) {
32509 default:
32510#ifndef NDEBUG
32511 dbgs() << "ReplaceNodeResults: ";
32512 N->dump(&DAG);
32513#endif
32514 llvm_unreachable("Do not know how to custom type legalize this operation!");
32515 case X86ISD::CVTPH2PS: {
32516 EVT VT = N->getValueType(0);
32517 SDValue Lo, Hi;
32518 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32519 EVT LoVT, HiVT;
32520 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32521 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32522 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32523 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32524 Results.push_back(Res);
32525 return;
32526 }
32527   case X86ISD::STRICT_CVTPH2PS: {
32528     EVT VT = N->getValueType(0);
32529 SDValue Lo, Hi;
32530 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32531 EVT LoVT, HiVT;
32532 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32533 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32534 {N->getOperand(0), Lo});
32535 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32536 {N->getOperand(0), Hi});
32537 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32538 Lo.getValue(1), Hi.getValue(1));
32539 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32540 Results.push_back(Res);
32541 Results.push_back(Chain);
32542 return;
32543 }
32544 case X86ISD::CVTPS2PH:
32545 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32546 return;
32547 case ISD::CTPOP: {
32548 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32549 // If we have at most 32 active bits, then perform as i32 CTPOP.
32550 // TODO: Perform this in generic legalizer?
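    // E.g. a value with at most 32 active bits (LZ + TZ >= 32) is shifted
    // right by TZ so the active bits land in the low 32 bits, popcounted as
    // i32, and the count is zero-extended back to i64.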
32551 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32552 unsigned LZ = Known.countMinLeadingZeros();
32553 unsigned TZ = Known.countMinTrailingZeros();
32554 if ((LZ + TZ) >= 32) {
32555 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32556 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32557 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32558 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32559 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32560 Results.push_back(Op);
32561 return;
32562 }
32563 // Use a v2i64 if possible.
32564 bool NoImplicitFloatOps =
32565         DAG.getMachineFunction().getFunction().hasFnAttribute(
32566             Attribute::NoImplicitFloat);
32567 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32568 SDValue Wide =
32569 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32570 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32571 // Bit count should fit in 32-bits, extract it as that and then zero
32572 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32573 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32574 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32575 DAG.getIntPtrConstant(0, dl));
32576 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32577 Results.push_back(Wide);
32578 }
32579 return;
32580 }
32581 case ISD::MUL: {
32582 EVT VT = N->getValueType(0);
32583     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32584            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32585 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32586 // elements are needed.
32587 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32588 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32589 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32590 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32591 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32592 unsigned NumConcats = 16 / VT.getVectorNumElements();
32593 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32594 ConcatOps[0] = Res;
32595 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32596 Results.push_back(Res);
32597 return;
32598 }
32599 case ISD::SMULO:
32600 case ISD::UMULO: {
32601 EVT VT = N->getValueType(0);
32602     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32603            VT == MVT::v2i32 && "Unexpected VT!");
32604 bool IsSigned = N->getOpcode() == ISD::SMULO;
32605 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32606 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32607 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32608 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32609 // Extract the high 32 bits from each result using PSHUFD.
32610 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
32611 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32612 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32613 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32614 DAG.getIntPtrConstant(0, dl));
32615
32616 // Truncate the low bits of the result. This will become PSHUFD.
32617 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32618
32619 SDValue HiCmp;
32620 if (IsSigned) {
32621 // SMULO overflows if the high bits don't match the sign of the low.
32622 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32623 } else {
32624 // UMULO overflows if the high bits are non-zero.
32625 HiCmp = DAG.getConstant(0, dl, VT);
32626 }
32627 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32628
32629     // Widen the result by padding with undef.
32630 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32631 DAG.getUNDEF(VT));
32632 Results.push_back(Res);
32633 Results.push_back(Ovf);
32634 return;
32635 }
32636 case X86ISD::VPMADDWD: {
32637 // Legalize types for X86ISD::VPMADDWD by widening.
32638 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32639
32640 EVT VT = N->getValueType(0);
32641 EVT InVT = N->getOperand(0).getValueType();
32642 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32643 "Expected a VT that divides into 128 bits.");
32644     assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32645            "Unexpected type action!");
32646 unsigned NumConcat = 128 / InVT.getSizeInBits();
32647
32648 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32649 InVT.getVectorElementType(),
32650 NumConcat * InVT.getVectorNumElements());
32651 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32652                                   VT.getVectorElementType(),
32653                                   NumConcat * VT.getVectorNumElements());
32654
32655 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32656 Ops[0] = N->getOperand(0);
32657 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32658 Ops[0] = N->getOperand(1);
32659 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32660
32661 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32662 Results.push_back(Res);
32663 return;
32664 }
32665 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32666 case X86ISD::FMINC:
32667 case X86ISD::FMIN:
32668 case X86ISD::FMAXC:
32669 case X86ISD::FMAX: {
32670 EVT VT = N->getValueType(0);
32671 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32672 SDValue UNDEF = DAG.getUNDEF(VT);
32673 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32674 N->getOperand(0), UNDEF);
32675 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32676 N->getOperand(1), UNDEF);
32677 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32678 return;
32679 }
32680 case ISD::SDIV:
32681 case ISD::UDIV:
32682 case ISD::SREM:
32683 case ISD::UREM: {
32684 EVT VT = N->getValueType(0);
32685 if (VT.isVector()) {
32686       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32687              "Unexpected type action!");
32688 // If this RHS is a constant splat vector we can widen this and let
32689 // division/remainder by constant optimize it.
32690 // TODO: Can we do something for non-splat?
32691 APInt SplatVal;
32692 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32693 unsigned NumConcats = 128 / VT.getSizeInBits();
32694 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32695 Ops0[0] = N->getOperand(0);
32696 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32697 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32698 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32699 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32700 Results.push_back(Res);
32701 }
32702 return;
32703 }
32704
32705 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32706 Results.push_back(V);
32707 return;
32708 }
32709 case ISD::TRUNCATE: {
32710 MVT VT = N->getSimpleValueType(0);
32711 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32712 return;
32713
32714 // The generic legalizer will try to widen the input type to the same
32715 // number of elements as the widened result type. But this isn't always
32716 // the best thing so do some custom legalization to avoid some cases.
32717 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32718 SDValue In = N->getOperand(0);
32719 EVT InVT = In.getValueType();
32720 EVT InEltVT = InVT.getVectorElementType();
32721 EVT EltVT = VT.getVectorElementType();
32722 unsigned MinElts = VT.getVectorNumElements();
32723 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32724 unsigned InBits = InVT.getSizeInBits();
32725
32726 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32727 unsigned PackOpcode;
32728 if (SDValue Src =
32729 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32730 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32731 dl, DAG, Subtarget)) {
32732 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32733 Results.push_back(Res);
32734 return;
32735 }
32736 }
32737
32738 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32739       // 128 bit and smaller inputs should avoid truncate altogether and
32740 // use a shuffle.
32741 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32742 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32743 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32744 for (unsigned I = 0; I < MinElts; ++I)
32745 TruncMask[I] = Scale * I;
32746 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32747 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32748 "Illegal vector type in truncation");
32749 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32750 Results.push_back(
32751 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32752 return;
32753 }
32754 }
32755
32756 // With AVX512 there are some cases that can use a target specific
32757 // truncate node to go from 256/512 to less than 128 with zeros in the
32758 // upper elements of the 128 bit result.
32759 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32760 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
32761 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32762 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32763 return;
32764 }
32765 // There's one case we can widen to 512 bits and use VTRUNC.
32766 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32767 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32768 DAG.getUNDEF(MVT::v4i64));
32769 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32770 return;
32771 }
32772 }
32773 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32774 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32775 isTypeLegal(MVT::v4i64)) {
32776       // Input needs to be split and output needs to be widened. Let's use two
32777 // VTRUNCs, and shuffle their results together into the wider type.
32778 SDValue Lo, Hi;
32779 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32780
32781 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32782 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32783 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32784 { 0, 1, 2, 3, 16, 17, 18, 19,
32785 -1, -1, -1, -1, -1, -1, -1, -1 });
32786 Results.push_back(Res);
32787 return;
32788 }
32789
32790 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32791 // this via type legalization.
32792 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32793 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32794 (!Subtarget.hasSSSE3() ||
32795 (!isTypeLegal(InVT) &&
32796 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32797 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32798 InEltVT.getSizeInBits() * WidenNumElts);
32799 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32800 return;
32801 }
32802
32803 return;
32804 }
32805 case ISD::ANY_EXTEND:
32806 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32807 // It's intended to custom handle the input type.
32808 assert(N->getValueType(0) == MVT::v8i8 &&
32809 "Do not know how to legalize this Node");
32810 return;
32811 case ISD::SIGN_EXTEND:
32812 case ISD::ZERO_EXTEND: {
32813 EVT VT = N->getValueType(0);
32814 SDValue In = N->getOperand(0);
32815 EVT InVT = In.getValueType();
32816 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32817 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32818       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32819              "Unexpected type action!");
32820 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32821       // Custom split this so we can extend i8/i16->i32 invec. This is better
32822       // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32823       // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
32824       // we allow the sra from the extend to i32 to be shared by the split.
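      // E.g. v4i8 -> v4i64: sign-extend once to v4i32, compute the sign words
      // with a single pcmpgt against zero, then interleave value and sign
      // words with unpckl/unpckh to form the two v2i64 halves.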
32825 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32826
32827 // Fill a vector with sign bits for each element.
32828 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32829 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32830
32831 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32832 // to v2i64.
32833 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32834 {0, 4, 1, 5});
32835 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32836 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32837 {2, 6, 3, 7});
32838 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32839
32840 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32841 Results.push_back(Res);
32842 return;
32843 }
32844
32845 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32846 if (!InVT.is128BitVector()) {
32847 // Not a 128 bit vector, but maybe type legalization will promote
32848 // it to 128 bits.
32849 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32850 return;
32851 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32852 if (!InVT.is128BitVector())
32853 return;
32854
32855 // Promote the input to 128 bits. Type legalization will turn this into
32856 // zext_inreg/sext_inreg.
32857 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32858 }
32859
32860 // Perform custom splitting instead of the two stage extend we would get
32861 // by default.
32862 EVT LoVT, HiVT;
32863 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32864 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32865
32866 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32867
32868 // We need to shift the input over by half the number of elements.
32869 unsigned NumElts = InVT.getVectorNumElements();
32870 unsigned HalfNumElts = NumElts / 2;
32871 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32872 for (unsigned i = 0; i != HalfNumElts; ++i)
32873 ShufMask[i] = i + HalfNumElts;
32874
32875 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32876 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32877
32878 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32879 Results.push_back(Res);
32880 }
32881 return;
32882 }
32883   case ISD::FP_TO_SINT:
32884   case ISD::STRICT_FP_TO_SINT:
32885   case ISD::FP_TO_UINT:
32886   case ISD::STRICT_FP_TO_UINT: {
32887     bool IsStrict = N->isStrictFPOpcode();
32888 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32889 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32890 EVT VT = N->getValueType(0);
32891 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32892 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32893 EVT SrcVT = Src.getValueType();
32894
32895 SDValue Res;
32896 if (isSoftF16(SrcVT, Subtarget)) {
32897 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32898 if (IsStrict) {
32899 Res =
32900 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32901 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32902 {NVT, MVT::Other}, {Chain, Src})});
32903 Chain = Res.getValue(1);
32904 } else {
32905 Res = DAG.getNode(N->getOpcode(), dl, VT,
32906 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32907 }
32908 Results.push_back(Res);
32909 if (IsStrict)
32910 Results.push_back(Chain);
32911
32912 return;
32913 }
32914
32915 if (VT.isVector() && Subtarget.hasFP16() &&
32916 SrcVT.getVectorElementType() == MVT::f16) {
32917 EVT EleVT = VT.getVectorElementType();
32918 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32919
32920 if (SrcVT != MVT::v8f16) {
32921 SDValue Tmp =
32922 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32923 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32924 Ops[0] = Src;
32925 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32926 }
32927
32928 if (IsStrict) {
32929 unsigned Opc =
32930             IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32931         Res =
32932 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32933 Chain = Res.getValue(1);
32934 } else {
32935 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32936 Res = DAG.getNode(Opc, dl, ResVT, Src);
32937 }
32938
32939 // TODO: Need to add exception check code for strict FP.
32940 if (EleVT.getSizeInBits() < 16) {
32941 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32942 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32943
32944 // Now widen to 128 bits.
32945 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32946 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32947 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32948 ConcatOps[0] = Res;
32949 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32950 }
32951
32952 Results.push_back(Res);
32953 if (IsStrict)
32954 Results.push_back(Chain);
32955
32956 return;
32957 }
32958
32959 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32960       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32961              "Unexpected type action!");
32962
32963 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32964 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32965 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32966                                        VT.getVectorNumElements());
32967       SDValue Res;
32968 SDValue Chain;
32969 if (IsStrict) {
32970 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32971 {N->getOperand(0), Src});
32972 Chain = Res.getValue(1);
32973 } else
32974 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32975
32976 // Preserve what we know about the size of the original result. If the
32977 // result is v2i32, we have to manually widen the assert.
32978 if (PromoteVT == MVT::v2i32)
32979 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32980 DAG.getUNDEF(MVT::v2i32));
32981
32982 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32983 Res.getValueType(), Res,
32984                         DAG.getValueType(VT.getVectorElementType()));
32985
32986 if (PromoteVT == MVT::v2i32)
32987 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32988 DAG.getIntPtrConstant(0, dl));
32989
32990 // Truncate back to the original width.
32991 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32992
32993 // Now widen to 128 bits.
32994 unsigned NumConcats = 128 / VT.getSizeInBits();
32995       EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
32996                                       VT.getVectorNumElements() * NumConcats);
32997 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32998 ConcatOps[0] = Res;
32999 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33000 Results.push_back(Res);
33001 if (IsStrict)
33002 Results.push_back(Chain);
33003 return;
33004 }
33005
33006
33007 if (VT == MVT::v2i32) {
33008 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33009 "Strict unsigned conversion requires AVX512");
33010 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33011       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33012              "Unexpected type action!");
33013 if (Src.getValueType() == MVT::v2f64) {
33014 if (!IsSigned && !Subtarget.hasAVX512()) {
33015 SDValue Res =
33016 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33017 Results.push_back(Res);
33018 return;
33019 }
33020
33021 unsigned Opc;
33022 if (IsStrict)
33023           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33024         else
33025 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33026
33027         // If we have VLX we can emit a target specific FP_TO_UINT node.
33028 if (!IsSigned && !Subtarget.hasVLX()) {
33029 // Otherwise we can defer to the generic legalizer which will widen
33030 // the input as well. This will be further widened during op
33031 // legalization to v8i32<-v8f64.
33032 // For strict nodes we'll need to widen ourselves.
33033 // FIXME: Fix the type legalizer to safely widen strict nodes?
33034 if (!IsStrict)
33035 return;
33036 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33037 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33038 Opc = N->getOpcode();
33039 }
33040 SDValue Res;
33041 SDValue Chain;
33042 if (IsStrict) {
33043 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33044 {N->getOperand(0), Src});
33045 Chain = Res.getValue(1);
33046 } else {
33047 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33048 }
33049 Results.push_back(Res);
33050 if (IsStrict)
33051 Results.push_back(Chain);
33052 return;
33053 }
33054
33055 // Custom widen strict v2f32->v2i32 by padding with zeros.
33056 // FIXME: Should generic type legalizer do this?
33057 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33058 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33059 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33060 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33061 {N->getOperand(0), Src});
33062 Results.push_back(Res);
33063 Results.push_back(Res.getValue(1));
33064 return;
33065 }
33066
33067 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33068 // so early out here.
33069 return;
33070 }
33071
33072 assert(!VT.isVector() && "Vectors should have been handled above!");
33073
33074 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33075 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33076 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33077 assert(!Subtarget.is64Bit() && "i64 should be legal");
33078 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33079 // If we use a 128-bit result we might need to use a target specific node.
33080 unsigned SrcElts =
33081 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33082 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33083 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33084 unsigned Opc = N->getOpcode();
33085 if (NumElts != SrcElts) {
33086 if (IsStrict)
33087 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33088 else
33089 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33090 }
33091
33092 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
33093 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33094 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33095 ZeroIdx);
33096 SDValue Chain;
33097 if (IsStrict) {
33098 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33099 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33100 Chain = Res.getValue(1);
33101 } else
33102 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33103 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33104 Results.push_back(Res);
33105 if (IsStrict)
33106 Results.push_back(Chain);
33107 return;
33108 }
33109
33110 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33111 SDValue Chain;
33112 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33113 Results.push_back(V);
33114 if (IsStrict)
33115 Results.push_back(Chain);
33116 return;
33117 }
33118
33119 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
33120 Results.push_back(V);
33121 if (IsStrict)
33122 Results.push_back(Chain);
33123 }
33124 return;
33125 }
33126 case ISD::LRINT:
33127 case ISD::LLRINT: {
33128 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
33129 Results.push_back(V);
33130 return;
33131 }
33132
33133 case ISD::SINT_TO_FP:
33134 case ISD::STRICT_SINT_TO_FP:
33135 case ISD::UINT_TO_FP:
33136 case ISD::STRICT_UINT_TO_FP: {
33137 bool IsStrict = N->isStrictFPOpcode();
33138 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33139 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33140 EVT VT = N->getValueType(0);
33141 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33142 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
33143 Subtarget.hasVLX()) {
33144 if (Src.getValueType().getVectorElementType() == MVT::i16)
33145 return;
33146
33147 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
33148 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33149 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
33150 : DAG.getUNDEF(MVT::v2i32));
33151 if (IsStrict) {
33152 unsigned Opc =
33153 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
33154 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
33155 {N->getOperand(0), Src});
33156 Results.push_back(Res);
33157 Results.push_back(Res.getValue(1));
33158 } else {
33159 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33160 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
33161 }
33162 return;
33163 }
33164 if (VT != MVT::v2f32)
33165 return;
33166 EVT SrcVT = Src.getValueType();
33167 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
33168 if (IsStrict) {
33169 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
33170 : X86ISD::STRICT_CVTUI2P;
33171 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33172 {N->getOperand(0), Src});
33173 Results.push_back(Res);
33174 Results.push_back(Res.getValue(1));
33175 } else {
33176 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33177 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33178 }
33179 return;
33180 }
33181 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33182 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
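// An i64 value with the top bit set cannot be converted with a plain signed
// cvt, so the slow path below converts (Src >> 1) | (Src & 1) instead
// (halving while keeping the sticky rounding bit) and doubles the result,
// selecting the slow result only for those "negative" lanes.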
33183 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33184 SDValue One = DAG.getConstant(1, dl, SrcVT);
33185 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33186 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33187 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33188 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33189 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33190 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33191 for (int i = 0; i != 2; ++i) {
33192 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33193 SignSrc, DAG.getIntPtrConstant(i, dl));
33194 if (IsStrict)
33195 SignCvts[i] =
33196 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33197 {N->getOperand(0), Elt});
33198 else
33199 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33200 };
33201 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33202 SDValue Slow, Chain;
33203 if (IsStrict) {
33204 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33205 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33206 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33207 {Chain, SignCvt, SignCvt});
33208 Chain = Slow.getValue(1);
33209 } else {
33210 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33211 }
33212 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33213 IsNeg =
33214 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33215 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33216 Results.push_back(Cvt);
33217 if (IsStrict)
33218 Results.push_back(Chain);
33219 return;
33220 }
33221
33222 if (SrcVT != MVT::v2i32)
33223 return;
33224
33225 if (IsSigned || Subtarget.hasAVX512()) {
33226 if (!IsStrict)
33227 return;
33228
33229 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33230 // FIXME: Should generic type legalizer do this?
33231 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33232 DAG.getConstant(0, dl, MVT::v2i32));
33233 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33234 {N->getOperand(0), Src});
33235 Results.push_back(Res);
33236 Results.push_back(Res.getValue(1));
33237 return;
33238 }
33239
33240 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
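// This uses the classic exponent-bias trick: 0x4330000000000000 is the bit
// pattern of 2^52, so OR-ing a 32-bit value x into the low mantissa bits
// produces exactly 2^52 + x, and subtracting the bias (VBias) leaves
// (double)x without needing any integer-to-fp instruction.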
33241 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33242 SDValue VBias = DAG.getConstantFP(
33243 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
33244 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33245 DAG.getBitcast(MVT::v2i64, VBias));
33246 Or = DAG.getBitcast(MVT::v2f64, Or);
33247 if (IsStrict) {
33248 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33249 {N->getOperand(0), Or, VBias});
33250 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33251 {MVT::v4f32, MVT::Other},
33252 {Sub.getValue(1), Sub});
33253 Results.push_back(Res);
33254 Results.push_back(Res.getValue(1));
33255 } else {
33256 // TODO: Are there any fast-math-flags to propagate here?
33257 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33258 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33259 }
33260 return;
33261 }
33262 case ISD::STRICT_FP_ROUND:
33263 case ISD::FP_ROUND: {
33264 bool IsStrict = N->isStrictFPOpcode();
33265 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33266 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33267 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33268 EVT SrcVT = Src.getValueType();
33269 EVT VT = N->getValueType(0);
33270 SDValue V;
33271 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33272 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33273 : DAG.getUNDEF(MVT::v2f32);
33274 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33275 }
33276 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33277 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33278 if (SrcVT.getVectorElementType() != MVT::f32)
33279 return;
33280
33281 if (IsStrict)
33282 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33283 {Chain, Src, Rnd});
33284 else
33285 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33286
33287 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33288 if (IsStrict)
33289 Results.push_back(V.getValue(1));
33290 return;
33291 }
33292 if (!isTypeLegal(Src.getValueType()))
33293 return;
33294 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33295 if (IsStrict)
33296 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33297 {Chain, Src});
33298 else
33299 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33300 Results.push_back(V);
33301 if (IsStrict)
33302 Results.push_back(V.getValue(1));
33303 return;
33304 }
33305 case ISD::FP_EXTEND:
33306 case ISD::STRICT_FP_EXTEND: {
33307 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33308 // No other ValueType for FP_EXTEND should reach this point.
33309 assert(N->getValueType(0) == MVT::v2f32 &&
33310 "Do not know how to legalize this Node");
33311 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33312 return;
33313 bool IsStrict = N->isStrictFPOpcode();
33314 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33315 if (Src.getValueType().getVectorElementType() != MVT::f16)
33316 return;
33317 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33318 : DAG.getUNDEF(MVT::v2f16);
33319 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33320 if (IsStrict)
33321 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33322 {N->getOperand(0), V});
33323 else
33324 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33325 Results.push_back(V);
33326 if (IsStrict)
33327 Results.push_back(V.getValue(1));
33328 return;
33329 }
33330 case ISD::INTRINSIC_W_CHAIN: {
33331 unsigned IntNo = N->getConstantOperandVal(1);
33332 switch (IntNo) {
33333 default : llvm_unreachable("Do not know how to custom type "
33334 "legalize this intrinsic operation!");
33335 case Intrinsic::x86_rdtsc:
33336 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33337 Results);
33338 case Intrinsic::x86_rdtscp:
33339 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33340 Results);
33341 case Intrinsic::x86_rdpmc:
33342 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33343 Results);
33344 return;
33345 case Intrinsic::x86_rdpru:
33346 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33347 Results);
33348 return;
33349 case Intrinsic::x86_xgetbv:
33350 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33351 Results);
33352 return;
33353 }
33354 }
33355 case ISD::READCYCLECOUNTER: {
33356 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33357 }
33358 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
33359 EVT T = N->getValueType(0);
33360 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33361 bool Regs64bit = T == MVT::i128;
33362 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33363 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33364 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
33365 SDValue cpInL, cpInH;
33366 std::tie(cpInL, cpInH) =
33367 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33368 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33369 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33370 cpInH =
33371 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33372 cpInH, cpInL.getValue(1));
33373 SDValue swapInL, swapInH;
33374 std::tie(swapInL, swapInH) =
33375 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33376 swapInH =
33377 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33378 swapInH, cpInH.getValue(1));
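// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and the
// replacement value in ECX:EBX (RCX:RBX); the previous memory contents come
// back in EDX:EAX (RDX:RAX) and ZF reports success, which is what the copies
// here and the SETCC on EFLAGS afterwards model.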
33379
33380 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33381 // until later. So we keep the RBX input in a vreg and use a custom
33382 // inserter.
33383 // Since RBX will be a reserved register the register allocator will not
33384 // make sure its value will be properly saved and restored around this
33385 // live-range.
33386 SDValue Result;
33387 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33388 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33389 if (Regs64bit) {
33390 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33391 swapInH.getValue(1)};
33392 Result =
33393 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33394 } else {
33395 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33396 swapInH.getValue(1));
33397 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33398 swapInL.getValue(1)};
33399 Result =
33400 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33401 }
33402
33403 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33404 Regs64bit ? X86::RAX : X86::EAX,
33405 HalfT, Result.getValue(1));
33406 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33407 Regs64bit ? X86::RDX : X86::EDX,
33408 HalfT, cpOutL.getValue(2));
33409 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33410
33411 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33412 MVT::i32, cpOutH.getValue(2));
33413 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33414 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33415
33416 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33417 Results.push_back(Success);
33418 Results.push_back(EFLAGS.getValue(1));
33419 return;
33420 }
33421 case ISD::ATOMIC_LOAD: {
33422 assert(
33423 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
33424 "Unexpected VT!");
33425 bool NoImplicitFloatOps =
33426 DAG.getMachineFunction().getFunction().hasFnAttribute(
33427 Attribute::NoImplicitFloat);
33428 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33429 auto *Node = cast<AtomicSDNode>(N);
33430
33431 if (N->getValueType(0) == MVT::i128) {
33432 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
33433 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
33434 Node->getBasePtr(), Node->getMemOperand());
33435 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33436 DAG.getIntPtrConstant(0, dl));
33437 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33438 DAG.getIntPtrConstant(1, dl));
33439 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
33440 {ResL, ResH}));
33441 Results.push_back(Ld.getValue(1));
33442 return;
33443 }
33444 break;
33445 }
33446 if (Subtarget.hasSSE1()) {
33447 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33448 // Then extract the lower 64-bits.
33449 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33450 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33451 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33452 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33453 MVT::i64, Node->getMemOperand());
33454 if (Subtarget.hasSSE2()) {
33455 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33456 DAG.getIntPtrConstant(0, dl));
33457 Results.push_back(Res);
33458 Results.push_back(Ld.getValue(1));
33459 return;
33460 }
33461 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33462 // then casts to i64. This avoids a 128-bit stack temporary being
33463 // created by type legalization if we were to cast v4f32->v2i64.
33464 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33465 DAG.getIntPtrConstant(0, dl));
33466 Res = DAG.getBitcast(MVT::i64, Res);
33467 Results.push_back(Res);
33468 Results.push_back(Ld.getValue(1));
33469 return;
33470 }
33471 if (Subtarget.hasX87()) {
33472 // First load this into an 80-bit X87 register. This will put the whole
33473 // integer into the significand.
33474 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33475 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33476 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33477 dl, Tys, Ops, MVT::i64,
33478 Node->getMemOperand());
33479 SDValue Chain = Result.getValue(1);
33480
33481 // Now store the X87 register to a stack temporary and convert to i64.
33482 // This store is not atomic and doesn't need to be.
33483 // FIXME: We don't need a stack temporary if the result of the load
33484 // is already being stored. We could just directly store there.
33485 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33486 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33487 MachinePointerInfo MPI =
33488 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33489 SDValue StoreOps[] = { Chain, Result, StackPtr };
33490 Chain = DAG.getMemIntrinsicNode(
33491 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33492 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33493
33494 // Finally load the value back from the stack temporary and return it.
33495 // This load is not atomic and doesn't need to be.
33496 // This load will be further type legalized.
33497 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33498 Results.push_back(Result);
33499 Results.push_back(Result.getValue(1));
33500 return;
33501 }
33502 }
33503 // TODO: Use MOVLPS when SSE1 is available?
33504 // Delegate to generic TypeLegalization. Situations we can really handle
33505 // should have already been dealt with by AtomicExpandPass.cpp.
33506 break;
33507 }
33508 case ISD::ATOMIC_SWAP:
33509 case ISD::ATOMIC_LOAD_ADD:
33510 case ISD::ATOMIC_LOAD_SUB:
33511 case ISD::ATOMIC_LOAD_AND:
33512 case ISD::ATOMIC_LOAD_OR:
33513 case ISD::ATOMIC_LOAD_XOR:
33514 case ISD::ATOMIC_LOAD_NAND:
33515 case ISD::ATOMIC_LOAD_MIN:
33516 case ISD::ATOMIC_LOAD_MAX:
33517 case ISD::ATOMIC_LOAD_UMIN:
33518 case ISD::ATOMIC_LOAD_UMAX:
33519 // Delegate to generic TypeLegalization. Situations we can really handle
33520 // should have already been dealt with by AtomicExpandPass.cpp.
33521 break;
33522
33523 case ISD::BITCAST: {
33524 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33525 EVT DstVT = N->getValueType(0);
33526 EVT SrcVT = N->getOperand(0).getValueType();
33527
33528 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33529 // we can split using the k-register rather than memory.
33530 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33531 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33532 SDValue Lo, Hi;
33533 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33534 Lo = DAG.getBitcast(MVT::i32, Lo);
33535 Hi = DAG.getBitcast(MVT::i32, Hi);
33536 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33537 Results.push_back(Res);
33538 return;
33539 }
33540
33541 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33542 // FIXME: Use v4f32 for SSE1?
33543 assert(Subtarget.hasSSE2() && "Requires SSE2");
33544 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33545 "Unexpected type action!");
33546 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33547 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33548 N->getOperand(0));
33549 Res = DAG.getBitcast(WideVT, Res);
33550 Results.push_back(Res);
33551 return;
33552 }
33553
33554 return;
33555 }
33556 case ISD::MGATHER: {
33557 EVT VT = N->getValueType(0);
33558 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33559 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33560 auto *Gather = cast<MaskedGatherSDNode>(N);
33561 SDValue Index = Gather->getIndex();
33562 if (Index.getValueType() != MVT::v2i64)
33563 return;
33564 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33565 "Unexpected type action!");
33566 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33567 SDValue Mask = Gather->getMask();
33568 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33569 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33570 Gather->getPassThru(),
33571 DAG.getUNDEF(VT));
33572 if (!Subtarget.hasVLX()) {
33573 // We need to widen the mask, but the instruction will only use 2
33574 // of its elements. So we can use undef.
33575 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33576 DAG.getUNDEF(MVT::v2i1));
33577 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33578 }
33579 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33580 Gather->getBasePtr(), Index, Gather->getScale() };
33581 SDValue Res = DAG.getMemIntrinsicNode(
33582 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33583 Gather->getMemoryVT(), Gather->getMemOperand());
33584 Results.push_back(Res);
33585 Results.push_back(Res.getValue(1));
33586 return;
33587 }
33588 return;
33589 }
33590 case ISD::LOAD: {
33591 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33592 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
33593 // cast since type legalization will try to use an i64 load.
33594 MVT VT = N->getSimpleValueType(0);
33595 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33596 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33597 "Unexpected type action!");
33598 if (!ISD::isNON_EXTLoad(N))
33599 return;
33600 auto *Ld = cast<LoadSDNode>(N);
33601 if (Subtarget.hasSSE2()) {
33602 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33603 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33604 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33605 Ld->getMemOperand()->getFlags());
33606 SDValue Chain = Res.getValue(1);
33607 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33608 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33609 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33610 Res = DAG.getBitcast(WideVT, Res);
33611 Results.push_back(Res);
33612 Results.push_back(Chain);
33613 return;
33614 }
33615 assert(Subtarget.hasSSE1() && "Expected SSE");
33616 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33617 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33618 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33619 MVT::i64, Ld->getMemOperand());
33620 Results.push_back(Res);
33621 Results.push_back(Res.getValue(1));
33622 return;
33623 }
33624 case ISD::ADDRSPACECAST: {
33625 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33626 Results.push_back(V);
33627 return;
33628 }
33629 case ISD::BITREVERSE: {
33630 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33631 assert(Subtarget.hasXOP() && "Expected XOP");
33632 // We can use VPPERM by copying to a vector register and back. We'll need
33633 // to move the scalar in two i32 pieces.
33634 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33635 return;
33636 }
33637 case ISD::EXTRACT_VECTOR_ELT: {
33638 // f16 = extract vXf16 %vec, i64 %idx
33639 assert(N->getSimpleValueType(0) == MVT::f16 &&
33640 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33641 assert(Subtarget.hasFP16() && "Expected FP16");
33642 SDValue VecOp = N->getOperand(0);
33643 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33644 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33645 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33646 N->getOperand(1));
33647 Split = DAG.getBitcast(MVT::f16, Split);
33648 Results.push_back(Split);
33649 return;
33650 }
33651 }
33652}
33653
33654const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33655 switch ((X86ISD::NodeType)Opcode) {
33656 case X86ISD::FIRST_NUMBER: break;
33657#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33658 NODE_NAME_CASE(BSF)
33659 NODE_NAME_CASE(BSR)
33660 NODE_NAME_CASE(FSHL)
33661 NODE_NAME_CASE(FSHR)
33662 NODE_NAME_CASE(FAND)
33663 NODE_NAME_CASE(FANDN)
33664 NODE_NAME_CASE(FOR)
33665 NODE_NAME_CASE(FXOR)
33666 NODE_NAME_CASE(FILD)
33667 NODE_NAME_CASE(FIST)
33668 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33669 NODE_NAME_CASE(FLD)
33670 NODE_NAME_CASE(FST)
33671 NODE_NAME_CASE(CALL)
33672 NODE_NAME_CASE(CALL_RVMARKER)
33673 NODE_NAME_CASE(BT)
33674 NODE_NAME_CASE(CMP)
33675 NODE_NAME_CASE(FCMP)
33676 NODE_NAME_CASE(STRICT_FCMP)
33677 NODE_NAME_CASE(STRICT_FCMPS)
33678 NODE_NAME_CASE(COMI)
33679 NODE_NAME_CASE(UCOMI)
33680 NODE_NAME_CASE(CMPM)
33681 NODE_NAME_CASE(CMPMM)
33682 NODE_NAME_CASE(STRICT_CMPM)
33683 NODE_NAME_CASE(CMPMM_SAE)
33684 NODE_NAME_CASE(SETCC)
33685 NODE_NAME_CASE(SETCC_CARRY)
33686 NODE_NAME_CASE(FSETCC)
33687 NODE_NAME_CASE(FSETCCM)
33688 NODE_NAME_CASE(FSETCCM_SAE)
33689 NODE_NAME_CASE(CMOV)
33690 NODE_NAME_CASE(BRCOND)
33691 NODE_NAME_CASE(RET_GLUE)
33692 NODE_NAME_CASE(IRET)
33693 NODE_NAME_CASE(REP_STOS)
33694 NODE_NAME_CASE(REP_MOVS)
33695 NODE_NAME_CASE(GlobalBaseReg)
33696 NODE_NAME_CASE(Wrapper)
33697 NODE_NAME_CASE(WrapperRIP)
33698 NODE_NAME_CASE(MOVQ2DQ)
33699 NODE_NAME_CASE(MOVDQ2Q)
33700 NODE_NAME_CASE(MMX_MOVD2W)
33701 NODE_NAME_CASE(MMX_MOVW2D)
33702 NODE_NAME_CASE(PEXTRB)
33703 NODE_NAME_CASE(PEXTRW)
33704 NODE_NAME_CASE(INSERTPS)
33705 NODE_NAME_CASE(PINSRB)
33706 NODE_NAME_CASE(PINSRW)
33707 NODE_NAME_CASE(PSHUFB)
33708 NODE_NAME_CASE(ANDNP)
33709 NODE_NAME_CASE(BLENDI)
33710 NODE_NAME_CASE(BLENDV)
33711 NODE_NAME_CASE(HADD)
33712 NODE_NAME_CASE(HSUB)
33713 NODE_NAME_CASE(FHADD)
33714 NODE_NAME_CASE(FHSUB)
33715 NODE_NAME_CASE(CONFLICT)
33716 NODE_NAME_CASE(FMAX)
33717 NODE_NAME_CASE(FMAXS)
33718 NODE_NAME_CASE(FMAX_SAE)
33719 NODE_NAME_CASE(FMAXS_SAE)
33720 NODE_NAME_CASE(FMIN)
33721 NODE_NAME_CASE(FMINS)
33722 NODE_NAME_CASE(FMIN_SAE)
33723 NODE_NAME_CASE(FMINS_SAE)
33724 NODE_NAME_CASE(FMAXC)
33725 NODE_NAME_CASE(FMINC)
33726 NODE_NAME_CASE(FRSQRT)
33727 NODE_NAME_CASE(FRCP)
33728 NODE_NAME_CASE(EXTRQI)
33729 NODE_NAME_CASE(INSERTQI)
33730 NODE_NAME_CASE(TLSADDR)
33731 NODE_NAME_CASE(TLSBASEADDR)
33732 NODE_NAME_CASE(TLSCALL)
33733 NODE_NAME_CASE(TLSDESC)
33734 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33735 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33736 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33737 NODE_NAME_CASE(EH_RETURN)
33738 NODE_NAME_CASE(TC_RETURN)
33739 NODE_NAME_CASE(FNSTCW16m)
33740 NODE_NAME_CASE(FLDCW16m)
33741 NODE_NAME_CASE(FNSTENVm)
33742 NODE_NAME_CASE(FLDENVm)
33743 NODE_NAME_CASE(LCMPXCHG_DAG)
33744 NODE_NAME_CASE(LCMPXCHG8_DAG)
33745 NODE_NAME_CASE(LCMPXCHG16_DAG)
33746 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33747 NODE_NAME_CASE(LADD)
33748 NODE_NAME_CASE(LSUB)
33749 NODE_NAME_CASE(LOR)
33750 NODE_NAME_CASE(LXOR)
33751 NODE_NAME_CASE(LAND)
33752 NODE_NAME_CASE(LBTS)
33753 NODE_NAME_CASE(LBTC)
33754 NODE_NAME_CASE(LBTR)
33755 NODE_NAME_CASE(LBTS_RM)
33756 NODE_NAME_CASE(LBTC_RM)
33757 NODE_NAME_CASE(LBTR_RM)
33758 NODE_NAME_CASE(AADD)
33759 NODE_NAME_CASE(AOR)
33760 NODE_NAME_CASE(AXOR)
33761 NODE_NAME_CASE(AAND)
33762 NODE_NAME_CASE(VZEXT_MOVL)
33763 NODE_NAME_CASE(VZEXT_LOAD)
33764 NODE_NAME_CASE(VEXTRACT_STORE)
33765 NODE_NAME_CASE(VTRUNC)
33766 NODE_NAME_CASE(VTRUNCS)
33767 NODE_NAME_CASE(VTRUNCUS)
33768 NODE_NAME_CASE(VMTRUNC)
33769 NODE_NAME_CASE(VMTRUNCS)
33770 NODE_NAME_CASE(VMTRUNCUS)
33771 NODE_NAME_CASE(VTRUNCSTORES)
33772 NODE_NAME_CASE(VTRUNCSTOREUS)
33773 NODE_NAME_CASE(VMTRUNCSTORES)
33774 NODE_NAME_CASE(VMTRUNCSTOREUS)
33775 NODE_NAME_CASE(VFPEXT)
33776 NODE_NAME_CASE(STRICT_VFPEXT)
33777 NODE_NAME_CASE(VFPEXT_SAE)
33778 NODE_NAME_CASE(VFPEXTS)
33779 NODE_NAME_CASE(VFPEXTS_SAE)
33780 NODE_NAME_CASE(VFPROUND)
33781 NODE_NAME_CASE(STRICT_VFPROUND)
33782 NODE_NAME_CASE(VMFPROUND)
33783 NODE_NAME_CASE(VFPROUND_RND)
33784 NODE_NAME_CASE(VFPROUNDS)
33785 NODE_NAME_CASE(VFPROUNDS_RND)
33786 NODE_NAME_CASE(VSHLDQ)
33787 NODE_NAME_CASE(VSRLDQ)
33788 NODE_NAME_CASE(VSHL)
33789 NODE_NAME_CASE(VSRL)
33790 NODE_NAME_CASE(VSRA)
33791 NODE_NAME_CASE(VSHLI)
33792 NODE_NAME_CASE(VSRLI)
33793 NODE_NAME_CASE(VSRAI)
33794 NODE_NAME_CASE(VSHLV)
33795 NODE_NAME_CASE(VSRLV)
33796 NODE_NAME_CASE(VSRAV)
33797 NODE_NAME_CASE(VROTLI)
33798 NODE_NAME_CASE(VROTRI)
33799 NODE_NAME_CASE(VPPERM)
33800 NODE_NAME_CASE(CMPP)
33801 NODE_NAME_CASE(STRICT_CMPP)
33802 NODE_NAME_CASE(PCMPEQ)
33803 NODE_NAME_CASE(PCMPGT)
33804 NODE_NAME_CASE(PHMINPOS)
33805 NODE_NAME_CASE(ADD)
33806 NODE_NAME_CASE(SUB)
33807 NODE_NAME_CASE(ADC)
33808 NODE_NAME_CASE(SBB)
33809 NODE_NAME_CASE(SMUL)
33810 NODE_NAME_CASE(UMUL)
33811 NODE_NAME_CASE(OR)
33812 NODE_NAME_CASE(XOR)
33813 NODE_NAME_CASE(AND)
33814 NODE_NAME_CASE(BEXTR)
33815 NODE_NAME_CASE(BEXTRI)
33816 NODE_NAME_CASE(BZHI)
33817 NODE_NAME_CASE(PDEP)
33818 NODE_NAME_CASE(PEXT)
33819 NODE_NAME_CASE(MUL_IMM)
33820 NODE_NAME_CASE(MOVMSK)
33821 NODE_NAME_CASE(PTEST)
33822 NODE_NAME_CASE(TESTP)
33823 NODE_NAME_CASE(KORTEST)
33824 NODE_NAME_CASE(KTEST)
33825 NODE_NAME_CASE(KADD)
33826 NODE_NAME_CASE(KSHIFTL)
33827 NODE_NAME_CASE(KSHIFTR)
33828 NODE_NAME_CASE(PACKSS)
33829 NODE_NAME_CASE(PACKUS)
33830 NODE_NAME_CASE(PALIGNR)
33831 NODE_NAME_CASE(VALIGN)
33832 NODE_NAME_CASE(VSHLD)
33833 NODE_NAME_CASE(VSHRD)
33834 NODE_NAME_CASE(VSHLDV)
33835 NODE_NAME_CASE(VSHRDV)
33836 NODE_NAME_CASE(PSHUFD)
33837 NODE_NAME_CASE(PSHUFHW)
33838 NODE_NAME_CASE(PSHUFLW)
33839 NODE_NAME_CASE(SHUFP)
33840 NODE_NAME_CASE(SHUF128)
33841 NODE_NAME_CASE(MOVLHPS)
33842 NODE_NAME_CASE(MOVHLPS)
33843 NODE_NAME_CASE(MOVDDUP)
33844 NODE_NAME_CASE(MOVSHDUP)
33845 NODE_NAME_CASE(MOVSLDUP)
33846 NODE_NAME_CASE(MOVSD)
33847 NODE_NAME_CASE(MOVSS)
33848 NODE_NAME_CASE(MOVSH)
33849 NODE_NAME_CASE(UNPCKL)
33850 NODE_NAME_CASE(UNPCKH)
33851 NODE_NAME_CASE(VBROADCAST)
33852 NODE_NAME_CASE(VBROADCAST_LOAD)
33853 NODE_NAME_CASE(VBROADCASTM)
33854 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33855 NODE_NAME_CASE(VPERMILPV)
33856 NODE_NAME_CASE(VPERMILPI)
33857 NODE_NAME_CASE(VPERM2X128)
33858 NODE_NAME_CASE(VPERMV)
33859 NODE_NAME_CASE(VPERMV3)
33860 NODE_NAME_CASE(VPERMI)
33861 NODE_NAME_CASE(VPTERNLOG)
33862 NODE_NAME_CASE(VFIXUPIMM)
33863 NODE_NAME_CASE(VFIXUPIMM_SAE)
33864 NODE_NAME_CASE(VFIXUPIMMS)
33865 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33866 NODE_NAME_CASE(VRANGE)
33867 NODE_NAME_CASE(VRANGE_SAE)
33868 NODE_NAME_CASE(VRANGES)
33869 NODE_NAME_CASE(VRANGES_SAE)
33870 NODE_NAME_CASE(PMULUDQ)
33871 NODE_NAME_CASE(PMULDQ)
33872 NODE_NAME_CASE(PSADBW)
33873 NODE_NAME_CASE(DBPSADBW)
33874 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33875 NODE_NAME_CASE(VAARG_64)
33876 NODE_NAME_CASE(VAARG_X32)
33877 NODE_NAME_CASE(DYN_ALLOCA)
33878 NODE_NAME_CASE(MFENCE)
33879 NODE_NAME_CASE(SEG_ALLOCA)
33880 NODE_NAME_CASE(PROBED_ALLOCA)
33881 NODE_NAME_CASE(RDRAND)
33882 NODE_NAME_CASE(RDSEED)
33883 NODE_NAME_CASE(RDPKRU)
33884 NODE_NAME_CASE(WRPKRU)
33885 NODE_NAME_CASE(VPMADDUBSW)
33886 NODE_NAME_CASE(VPMADDWD)
33887 NODE_NAME_CASE(VPSHA)
33888 NODE_NAME_CASE(VPSHL)
33889 NODE_NAME_CASE(VPCOM)
33890 NODE_NAME_CASE(VPCOMU)
33891 NODE_NAME_CASE(VPERMIL2)
33892 NODE_NAME_CASE(FMSUB)
33893 NODE_NAME_CASE(STRICT_FMSUB)
33894 NODE_NAME_CASE(FNMADD)
33895 NODE_NAME_CASE(STRICT_FNMADD)
33896 NODE_NAME_CASE(FNMSUB)
33897 NODE_NAME_CASE(STRICT_FNMSUB)
33898 NODE_NAME_CASE(FMADDSUB)
33899 NODE_NAME_CASE(FMSUBADD)
33900 NODE_NAME_CASE(FMADD_RND)
33901 NODE_NAME_CASE(FNMADD_RND)
33902 NODE_NAME_CASE(FMSUB_RND)
33903 NODE_NAME_CASE(FNMSUB_RND)
33904 NODE_NAME_CASE(FMADDSUB_RND)
33905 NODE_NAME_CASE(FMSUBADD_RND)
33906 NODE_NAME_CASE(VFMADDC)
33907 NODE_NAME_CASE(VFMADDC_RND)
33908 NODE_NAME_CASE(VFCMADDC)
33909 NODE_NAME_CASE(VFCMADDC_RND)
33910 NODE_NAME_CASE(VFMULC)
33911 NODE_NAME_CASE(VFMULC_RND)
33912 NODE_NAME_CASE(VFCMULC)
33913 NODE_NAME_CASE(VFCMULC_RND)
33914 NODE_NAME_CASE(VFMULCSH)
33915 NODE_NAME_CASE(VFMULCSH_RND)
33916 NODE_NAME_CASE(VFCMULCSH)
33917 NODE_NAME_CASE(VFCMULCSH_RND)
33918 NODE_NAME_CASE(VFMADDCSH)
33919 NODE_NAME_CASE(VFMADDCSH_RND)
33920 NODE_NAME_CASE(VFCMADDCSH)
33921 NODE_NAME_CASE(VFCMADDCSH_RND)
33922 NODE_NAME_CASE(VPMADD52H)
33923 NODE_NAME_CASE(VPMADD52L)
33924 NODE_NAME_CASE(VRNDSCALE)
33925 NODE_NAME_CASE(STRICT_VRNDSCALE)
33926 NODE_NAME_CASE(VRNDSCALE_SAE)
33927 NODE_NAME_CASE(VRNDSCALES)
33928 NODE_NAME_CASE(VRNDSCALES_SAE)
33929 NODE_NAME_CASE(VREDUCE)
33930 NODE_NAME_CASE(VREDUCE_SAE)
33931 NODE_NAME_CASE(VREDUCES)
33932 NODE_NAME_CASE(VREDUCES_SAE)
33933 NODE_NAME_CASE(VGETMANT)
33934 NODE_NAME_CASE(VGETMANT_SAE)
33935 NODE_NAME_CASE(VGETMANTS)
33936 NODE_NAME_CASE(VGETMANTS_SAE)
33937 NODE_NAME_CASE(PCMPESTR)
33938 NODE_NAME_CASE(PCMPISTR)
33939 NODE_NAME_CASE(XTEST)
33940 NODE_NAME_CASE(COMPRESS)
33941 NODE_NAME_CASE(EXPAND)
33942 NODE_NAME_CASE(SELECTS)
33943 NODE_NAME_CASE(ADDSUB)
33944 NODE_NAME_CASE(RCP14)
33945 NODE_NAME_CASE(RCP14S)
33946 NODE_NAME_CASE(RSQRT14)
33947 NODE_NAME_CASE(RSQRT14S)
33948 NODE_NAME_CASE(FADD_RND)
33949 NODE_NAME_CASE(FADDS)
33950 NODE_NAME_CASE(FADDS_RND)
33951 NODE_NAME_CASE(FSUB_RND)
33952 NODE_NAME_CASE(FSUBS)
33953 NODE_NAME_CASE(FSUBS_RND)
33954 NODE_NAME_CASE(FMUL_RND)
33955 NODE_NAME_CASE(FMULS)
33956 NODE_NAME_CASE(FMULS_RND)
33957 NODE_NAME_CASE(FDIV_RND)
33958 NODE_NAME_CASE(FDIVS)
33959 NODE_NAME_CASE(FDIVS_RND)
33960 NODE_NAME_CASE(FSQRT_RND)
33961 NODE_NAME_CASE(FSQRTS)
33962 NODE_NAME_CASE(FSQRTS_RND)
33963 NODE_NAME_CASE(FGETEXP)
33964 NODE_NAME_CASE(FGETEXP_SAE)
33965 NODE_NAME_CASE(FGETEXPS)
33966 NODE_NAME_CASE(FGETEXPS_SAE)
33967 NODE_NAME_CASE(SCALEF)
33968 NODE_NAME_CASE(SCALEF_RND)
33969 NODE_NAME_CASE(SCALEFS)
33970 NODE_NAME_CASE(SCALEFS_RND)
33971 NODE_NAME_CASE(MULHRS)
33972 NODE_NAME_CASE(SINT_TO_FP_RND)
33973 NODE_NAME_CASE(UINT_TO_FP_RND)
33974 NODE_NAME_CASE(CVTTP2SI)
33975 NODE_NAME_CASE(CVTTP2UI)
33976 NODE_NAME_CASE(STRICT_CVTTP2SI)
33977 NODE_NAME_CASE(STRICT_CVTTP2UI)
33978 NODE_NAME_CASE(MCVTTP2SI)
33979 NODE_NAME_CASE(MCVTTP2UI)
33980 NODE_NAME_CASE(CVTTP2SI_SAE)
33981 NODE_NAME_CASE(CVTTP2UI_SAE)
33982 NODE_NAME_CASE(CVTTS2SI)
33983 NODE_NAME_CASE(CVTTS2UI)
33984 NODE_NAME_CASE(CVTTS2SI_SAE)
33985 NODE_NAME_CASE(CVTTS2UI_SAE)
33986 NODE_NAME_CASE(CVTSI2P)
33987 NODE_NAME_CASE(CVTUI2P)
33988 NODE_NAME_CASE(STRICT_CVTSI2P)
33989 NODE_NAME_CASE(STRICT_CVTUI2P)
33990 NODE_NAME_CASE(MCVTSI2P)
33991 NODE_NAME_CASE(MCVTUI2P)
33992 NODE_NAME_CASE(VFPCLASS)
33993 NODE_NAME_CASE(VFPCLASSS)
33994 NODE_NAME_CASE(MULTISHIFT)
33995 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33996 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33997 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33998 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33999 NODE_NAME_CASE(CVTPS2PH)
34000 NODE_NAME_CASE(STRICT_CVTPS2PH)
34001 NODE_NAME_CASE(CVTPS2PH_SAE)
34002 NODE_NAME_CASE(MCVTPS2PH)
34003 NODE_NAME_CASE(MCVTPS2PH_SAE)
34004 NODE_NAME_CASE(CVTPH2PS)
34005 NODE_NAME_CASE(STRICT_CVTPH2PS)
34006 NODE_NAME_CASE(CVTPH2PS_SAE)
34007 NODE_NAME_CASE(CVTP2SI)
34008 NODE_NAME_CASE(CVTP2UI)
34009 NODE_NAME_CASE(MCVTP2SI)
34010 NODE_NAME_CASE(MCVTP2UI)
34011 NODE_NAME_CASE(CVTP2SI_RND)
34012 NODE_NAME_CASE(CVTP2UI_RND)
34013 NODE_NAME_CASE(CVTS2SI)
34014 NODE_NAME_CASE(CVTS2UI)
34015 NODE_NAME_CASE(CVTS2SI_RND)
34016 NODE_NAME_CASE(CVTS2UI_RND)
34017 NODE_NAME_CASE(CVTNE2PS2BF16)
34018 NODE_NAME_CASE(CVTNEPS2BF16)
34019 NODE_NAME_CASE(MCVTNEPS2BF16)
34020 NODE_NAME_CASE(DPBF16PS)
34021 NODE_NAME_CASE(LWPINS)
34022 NODE_NAME_CASE(MGATHER)
34023 NODE_NAME_CASE(MSCATTER)
34024 NODE_NAME_CASE(VPDPBUSD)
34025 NODE_NAME_CASE(VPDPBUSDS)
34026 NODE_NAME_CASE(VPDPWSSD)
34027 NODE_NAME_CASE(VPDPWSSDS)
34028 NODE_NAME_CASE(VPSHUFBITQMB)
34029 NODE_NAME_CASE(GF2P8MULB)
34030 NODE_NAME_CASE(GF2P8AFFINEQB)
34031 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34032 NODE_NAME_CASE(NT_CALL)
34033 NODE_NAME_CASE(NT_BRIND)
34034 NODE_NAME_CASE(UMWAIT)
34035 NODE_NAME_CASE(TPAUSE)
34036 NODE_NAME_CASE(ENQCMD)
34037 NODE_NAME_CASE(ENQCMDS)
34038 NODE_NAME_CASE(VP2INTERSECT)
34039 NODE_NAME_CASE(VPDPBSUD)
34040 NODE_NAME_CASE(VPDPBSUDS)
34041 NODE_NAME_CASE(VPDPBUUD)
34042 NODE_NAME_CASE(VPDPBUUDS)
34043 NODE_NAME_CASE(VPDPBSSD)
34044 NODE_NAME_CASE(VPDPBSSDS)
34045 NODE_NAME_CASE(AESENC128KL)
34046 NODE_NAME_CASE(AESDEC128KL)
34047 NODE_NAME_CASE(AESENC256KL)
34048 NODE_NAME_CASE(AESDEC256KL)
34049 NODE_NAME_CASE(AESENCWIDE128KL)
34050 NODE_NAME_CASE(AESDECWIDE128KL)
34051 NODE_NAME_CASE(AESENCWIDE256KL)
34052 NODE_NAME_CASE(AESDECWIDE256KL)
34053 NODE_NAME_CASE(CMPCCXADD)
34054 NODE_NAME_CASE(TESTUI)
34055 NODE_NAME_CASE(FP80_ADD)
34056 NODE_NAME_CASE(STRICT_FP80_ADD)
34057 NODE_NAME_CASE(CCMP)
34058 NODE_NAME_CASE(CTEST)
34059 NODE_NAME_CASE(CLOAD)
34060 NODE_NAME_CASE(CSTORE)
34061 }
34062 return nullptr;
34063#undef NODE_NAME_CASE
34064}
34065
34066/// Return true if the addressing mode represented by AM is legal for this
34067/// target, for a load/store of the specified type.
34068 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
34069 const AddrMode &AM, Type *Ty,
34070 unsigned AS,
34071 Instruction *I) const {
34072 // X86 supports extremely general addressing modes.
34073 CodeModel::Model M = getTargetMachine().getCodeModel();
34074
34075 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34076 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
34077 return false;
34078
34079 if (AM.BaseGV) {
34080 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
34081
34082 // If a reference to this global requires an extra load, we can't fold it.
34083 if (isGlobalStubReference(GVFlags))
34084 return false;
34085
34086 // If BaseGV requires a register for the PIC base, we cannot also have a
34087 // BaseReg specified.
34088 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
34089 return false;
34090
34091 // If lower 4G is not available, then we must use rip-relative addressing.
34092 if ((M != CodeModel::Small || isPositionIndependent()) &&
34093 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
34094 return false;
34095 }
34096
34097 switch (AM.Scale) {
34098 case 0:
34099 case 1:
34100 case 2:
34101 case 4:
34102 case 8:
34103 // These scales always work.
34104 break;
34105 case 3:
34106 case 5:
34107 case 9:
34108 // These scales are formed with basereg+scalereg. Only accept if there is
34109 // no basereg yet.
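// (e.g. x*9 is formed as "lea (%rax,%rax,8), ..."; the base-register slot is
// already consumed by the second copy of x, so no other base can be folded.)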
34110 if (AM.HasBaseReg)
34111 return false;
34112 break;
34113 default: // Other stuff never works.
34114 return false;
34115 }
34116
34117 return true;
34118}
34119
34120 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
34121 unsigned Bits = Ty->getScalarSizeInBits();
34122
34123 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
34124 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
34125 if (Subtarget.hasXOP() &&
34126 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
34127 return false;
34128
34129 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
34130 // shifts just as cheap as scalar ones.
34131 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
34132 return false;
34133
34134 // AVX512BW has shifts such as vpsllvw.
34135 if (Subtarget.hasBWI() && Bits == 16)
34136 return false;
34137
34138 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
34139 // fully general vector.
34140 return true;
34141}
34142
34143bool X86TargetLowering::isBinOp(unsigned Opcode) const {
34144 switch (Opcode) {
34145 // These are non-commutative binops.
34146 // TODO: Add more X86ISD opcodes once we have test coverage.
34147 case X86ISD::ANDNP:
34148 case X86ISD::PCMPGT:
34149 case X86ISD::FMAX:
34150 case X86ISD::FMIN:
34151 case X86ISD::FANDN:
34152 case X86ISD::VPSHA:
34153 case X86ISD::VPSHL:
34154 case X86ISD::VSHLV:
34155 case X86ISD::VSRLV:
34156 case X86ISD::VSRAV:
34157 return true;
34158 }
34159
34160 return TargetLoweringBase::isBinOp(Opcode);
34161}
34162
34163bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
34164 switch (Opcode) {
34165 // TODO: Add more X86ISD opcodes once we have test coverage.
34166 case X86ISD::PCMPEQ:
34167 case X86ISD::PMULDQ:
34168 case X86ISD::PMULUDQ:
34169 case X86ISD::FMAXC:
34170 case X86ISD::FMINC:
34171 case X86ISD::FAND:
34172 case X86ISD::FOR:
34173 case X86ISD::FXOR:
34174 return true;
34175 }
34176
34177 return TargetLoweringBase::isCommutativeBinOp(Opcode);
34178}
34179
34180 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
34181 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34182 return false;
34183 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34184 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34185 return NumBits1 > NumBits2;
34186}
34187
34188 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34189 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34190 return false;
34191
34192 if (!isTypeLegal(EVT::getEVT(Ty1)))
34193 return false;
34194
34195 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34196
34197 // Assuming the caller doesn't have a zeroext or signext return parameter,
34198 // truncation all the way down to i1 is valid.
34199 return true;
34200}
34201
34202 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34203 return isInt<32>(Imm);
34204}
34205
34206 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34207 // Can also use sub to handle negated immediates.
34208 return isInt<32>(Imm);
34209}
34210
34211 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34212 return isInt<32>(Imm);
34213}
34214
34215 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34216 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34217 return false;
34218 unsigned NumBits1 = VT1.getSizeInBits();
34219 unsigned NumBits2 = VT2.getSizeInBits();
34220 return NumBits1 > NumBits2;
34221}
34222
34223 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34224 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
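// (For example "movl %esi, %eax" already clears bits 63:32 of RAX, so the
// zero-extension costs no extra instruction.)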
34225 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34226}
34227
34228 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34229 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34230 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34231}
34232
34233 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34234 EVT VT1 = Val.getValueType();
34235 if (isZExtFree(VT1, VT2))
34236 return true;
34237
34238 if (Val.getOpcode() != ISD::LOAD)
34239 return false;
34240
34241 if (!VT1.isSimple() || !VT1.isInteger() ||
34242 !VT2.isSimple() || !VT2.isInteger())
34243 return false;
34244
34245 switch (VT1.getSimpleVT().SimpleTy) {
34246 default: break;
34247 case MVT::i8:
34248 case MVT::i16:
34249 case MVT::i32:
34250 // X86 has 8, 16, and 32-bit zero-extending loads.
34251 return true;
34252 }
34253
34254 return false;
34255}
34256
34257 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34258 SmallVectorImpl<Use *> &Ops) const {
34259 using namespace llvm::PatternMatch;
34260
34261 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34262 if (!VTy)
34263 return false;
34264
34265 if (I->getOpcode() == Instruction::Mul &&
34266 VTy->getElementType()->isIntegerTy(64)) {
34267 for (auto &Op : I->operands()) {
34268 // Make sure we are not already sinking this operand
34269 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34270 continue;
34271
34272 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34273 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
34274 if (Subtarget.hasSSE41() &&
34275 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34276 m_SpecificInt(32)))) {
34277 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34278 Ops.push_back(&Op);
34279 } else if (Subtarget.hasSSE2() &&
34280 match(Op.get(),
34281 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34282 Ops.push_back(&Op);
34283 }
34284 }
34285
34286 return !Ops.empty();
34287 }
34288
34289 // A uniform shift amount in a vector shift or funnel shift may be much
34290 // cheaper than a generic variable vector shift, so make that pattern visible
34291 // to SDAG by sinking the shuffle instruction next to the shift.
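// For example, a shl whose vector amount is a splat can be selected as a
// single shift-by-scalar (e.g. PSLLD with the count taken from one element)
// once the splat shuffle is visible in the same block as the shift.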
34292 int ShiftAmountOpNum = -1;
34293 if (I->isShift())
34294 ShiftAmountOpNum = 1;
34295 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34296 if (II->getIntrinsicID() == Intrinsic::fshl ||
34297 II->getIntrinsicID() == Intrinsic::fshr)
34298 ShiftAmountOpNum = 2;
34299 }
34300
34301 if (ShiftAmountOpNum == -1)
34302 return false;
34303
34304 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34305 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34306 isVectorShiftByScalarCheap(I->getType())) {
34307 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34308 return true;
34309 }
34310
34311 return false;
34312}
34313
34314 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34315 if (!Subtarget.is64Bit())
34316 return false;
34317 return TargetLowering::shouldConvertPhiType(From, To);
34318}
34319
34320 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34321 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34322 return false;
34323
34324 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34325
34326 // There is no extending load for vXi1.
34327 if (SrcVT.getScalarType() == MVT::i1)
34328 return false;
34329
34330 return true;
34331}
34332
34333 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34334 EVT VT) const {
34335 if (!Subtarget.hasAnyFMA())
34336 return false;
34337
34338 VT = VT.getScalarType();
34339
34340 if (!VT.isSimple())
34341 return false;
34342
34343 switch (VT.getSimpleVT().SimpleTy) {
34344 case MVT::f16:
34345 return Subtarget.hasFP16();
34346 case MVT::f32:
34347 case MVT::f64:
34348 return true;
34349 default:
34350 break;
34351 }
34352
34353 return false;
34354}
34355
34356 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34357 // i16 instructions are longer (0x66 prefix) and potentially slower.
34358 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34359}
34360
34361 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
34362 EVT VT) const {
34363 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34364 // benefit. The transform may also be profitable for scalar code.
34365 if (!Subtarget.hasAVX512())
34366 return false;
34367 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34368 return false;
34369 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34370 return false;
34371
34372 return true;
34373}
34374
34375/// Targets can use this to indicate that they only support *some*
34376/// VECTOR_SHUFFLE operations, those with specific masks.
34377/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34378/// are assumed to be legal.
34379 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34380 if (!VT.isSimple())
34381 return false;
34382
34383 // Not for i1 vectors
34384 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34385 return false;
34386
34387 // Very little shuffling can be done for 64-bit vectors right now.
34388 if (VT.getSimpleVT().getSizeInBits() == 64)
34389 return false;
34390
34391 // We only care that the types being shuffled are legal. The lowering can
34392 // handle any possible shuffle mask that results.
34393 return isTypeLegal(VT.getSimpleVT());
34394}
34395
34396 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34397 EVT VT) const {
34398 // Don't convert an 'and' into a shuffle that we don't directly support.
34399 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34400 if (!Subtarget.hasAVX2())
34401 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34402 return false;
34403
34404 // Just delegate to the generic legality, clear masks aren't special.
34405 return isShuffleMaskLegal(Mask, VT);
34406}
34407
34408 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34409 // If the subtarget is using thunks, we need to not generate jump tables.
34410 if (Subtarget.useIndirectThunkBranches())
34411 return false;
34412
34413 // Otherwise, fall back on the generic logic.
34414 return TargetLowering::areJTsAllowed(Fn);
34415}
34416
34417 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34418 EVT ConditionVT) const {
34419 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34420 // zero-extensions.
34421 if (ConditionVT.getSizeInBits() < 32)
34422 return MVT::i32;
34423 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34424 ConditionVT);
34425}
34426
34427//===----------------------------------------------------------------------===//
34428// X86 Scheduler Hooks
34429//===----------------------------------------------------------------------===//
34430
34431 // Returns true if EFLAGS is consumed after this iterator in the rest of the
34432 // basic block or any successors of the basic block.
34433 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34434 MachineBasicBlock *BB) {
34435 // Scan forward through BB for a use/def of EFLAGS.
34436 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34437 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34438 return true;
34439 // If we found a def, we can stop searching.
34440 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34441 return false;
34442 }
34443
34444 // If we hit the end of the block, check whether EFLAGS is live into a
34445 // successor.
34446 for (MachineBasicBlock *Succ : BB->successors())
34447 if (Succ->isLiveIn(X86::EFLAGS))
34448 return true;
34449
34450 return false;
34451}
34452
34453/// Utility function to emit xbegin specifying the start of an RTM region.
34454 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34455 const TargetInstrInfo *TII) {
34456 const MIMetadata MIMD(MI);
34457
34458 const BasicBlock *BB = MBB->getBasicBlock();
34459 MachineFunction::iterator I = ++MBB->getIterator();
34460
34461 // For the v = xbegin(), we generate
34462 //
34463 // thisMBB:
34464 // xbegin sinkMBB
34465 //
34466 // mainMBB:
34467 // s0 = -1
34468 //
34469 // fallBB:
34470 // eax = # XABORT_DEF
34471 // s1 = eax
34472 //
34473 // sinkMBB:
34474 // v = phi(s0/mainBB, s1/fallBB)
34475
34476 MachineBasicBlock *thisMBB = MBB;
34477 MachineFunction *MF = MBB->getParent();
34478 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34479 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34480 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34481 MF->insert(I, mainMBB);
34482 MF->insert(I, fallMBB);
34483 MF->insert(I, sinkMBB);
34484
34485 if (isEFLAGSLiveAfter(MI, MBB)) {
34486 mainMBB->addLiveIn(X86::EFLAGS);
34487 fallMBB->addLiveIn(X86::EFLAGS);
34488 sinkMBB->addLiveIn(X86::EFLAGS);
34489 }
34490
34491 // Transfer the remainder of BB and its successor edges to sinkMBB.
34492 sinkMBB->splice(sinkMBB->begin(), MBB,
34493 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34494 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34495
34496 MachineRegisterInfo &MRI = MF->getRegInfo();
34497 Register DstReg = MI.getOperand(0).getReg();
34498 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34499 Register mainDstReg = MRI.createVirtualRegister(RC);
34500 Register fallDstReg = MRI.createVirtualRegister(RC);
34501
34502 // thisMBB:
34503 // xbegin fallMBB
34504 // # fallthrough to mainMBB
34505 // # on abort, control transfers to fallMBB
34506 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34507 thisMBB->addSuccessor(mainMBB);
34508 thisMBB->addSuccessor(fallMBB);
34509
34510 // mainMBB:
34511 // mainDstReg := -1
34512 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34513 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34514 mainMBB->addSuccessor(sinkMBB);
34515
34516 // fallMBB:
34517 // ; pseudo instruction to model hardware's definition from XABORT
34518 // EAX := XABORT_DEF
34519 // fallDstReg := EAX
34520 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34521 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34522 .addReg(X86::EAX);
34523 fallMBB->addSuccessor(sinkMBB);
34524
34525 // sinkMBB:
34526 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34527 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34528 .addReg(mainDstReg).addMBB(mainMBB)
34529 .addReg(fallDstReg).addMBB(fallMBB);
34530
34531 MI.eraseFromParent();
34532 return sinkMBB;
34533}
34534
34535 MachineBasicBlock *
34536 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34537 MachineBasicBlock *MBB) const {
34538 // Emit va_arg instruction on X86-64.
34539
34540 // Operands to this pseudo-instruction:
34541 // 0 ) Output : destination address (reg)
34542 // 1-5) Input : va_list address (addr, i64mem)
34543 // 6 ) ArgSize : Size (in bytes) of vararg type
34544 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34545 // 8 ) Align : Alignment of type
34546 // 9 ) EFLAGS (implicit-def)
34547
34548 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34549 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34550
34551 Register DestReg = MI.getOperand(0).getReg();
34552 MachineOperand &Base = MI.getOperand(1);
34553 MachineOperand &Scale = MI.getOperand(2);
34554 MachineOperand &Index = MI.getOperand(3);
34555 MachineOperand &Disp = MI.getOperand(4);
34556 MachineOperand &Segment = MI.getOperand(5);
34557 unsigned ArgSize = MI.getOperand(6).getImm();
34558 unsigned ArgMode = MI.getOperand(7).getImm();
34559 Align Alignment = Align(MI.getOperand(8).getImm());
34560
34561 MachineFunction *MF = MBB->getParent();
34562
34563 // Memory Reference
34564 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34565
34566 MachineMemOperand *OldMMO = MI.memoperands().front();
34567
34568 // Clone the MMO into two separate MMOs for loading and storing
34569 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34570 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34571 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34572 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34573
34574 // Machine Information
34575 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34576 MachineRegisterInfo &MRI = MF->getRegInfo();
34577 const TargetRegisterClass *AddrRegClass =
34578 getRegClassFor(getPointerTy(MF->getDataLayout()));
34579 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34580 const MIMetadata MIMD(MI);
34581
34582 // struct va_list {
34583 // i32 gp_offset
34584 // i32 fp_offset
34585 // i64 overflow_area (address)
34586 // i64 reg_save_area (address)
34587 // }
34588 // sizeof(va_list) = 24
34589 // alignment(va_list) = 8
34590
34591 unsigned TotalNumIntRegs = 6;
34592 unsigned TotalNumXMMRegs = 8;
34593 bool UseGPOffset = (ArgMode == 1);
34594 bool UseFPOffset = (ArgMode == 2);
34595 unsigned MaxOffset = TotalNumIntRegs * 8 +
34596 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
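// With 6 GP registers of 8 bytes and 8 XMM registers of 16 bytes, the GP part
// of reg_save_area spans [0, 48) and the XMM part [48, 176), so MaxOffset is
// 48 when using gp_offset and 176 when using fp_offset.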
34597
34598 /* Align ArgSize to a multiple of 8 */
34599 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
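// ((ArgSize + 7) & ~7 rounds ArgSize up to the next multiple of 8, e.g. an
// ArgSize of 12 becomes 16.)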
34600 bool NeedsAlign = (Alignment > 8);
34601
34602 MachineBasicBlock *thisMBB = MBB;
34603 MachineBasicBlock *overflowMBB;
34604 MachineBasicBlock *offsetMBB;
34605 MachineBasicBlock *endMBB;
34606
34607 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34608 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34609 unsigned OffsetReg = 0;
34610
34611 if (!UseGPOffset && !UseFPOffset) {
34612 // If we only pull from the overflow region, we don't create a branch.
34613 // We don't need to alter control flow.
34614 OffsetDestReg = 0; // unused
34615 OverflowDestReg = DestReg;
34616
34617 offsetMBB = nullptr;
34618 overflowMBB = thisMBB;
34619 endMBB = thisMBB;
34620 } else {
34621 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34622 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34623 // If not, pull from overflow_area. (branch to overflowMBB)
34624 //
34625 // thisMBB
34626 // | .
34627 // | .
34628 // offsetMBB overflowMBB
34629 // | .
34630 // | .
34631 // endMBB
34632
34633 // Registers for the PHI in endMBB
34634 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34635 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34636
34637 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34638 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34639 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34640 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34641
34642 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34643
34644 // Insert the new basic blocks
34645 MF->insert(MBBIter, offsetMBB);
34646 MF->insert(MBBIter, overflowMBB);
34647 MF->insert(MBBIter, endMBB);
34648
34649 // Transfer the remainder of MBB and its successor edges to endMBB.
34650 endMBB->splice(endMBB->begin(), thisMBB,
34651 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34652 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34653
34654 // Make offsetMBB and overflowMBB successors of thisMBB
34655 thisMBB->addSuccessor(offsetMBB);
34656 thisMBB->addSuccessor(overflowMBB);
34657
34658 // endMBB is a successor of both offsetMBB and overflowMBB
34659 offsetMBB->addSuccessor(endMBB);
34660 overflowMBB->addSuccessor(endMBB);
34661
34662 // Load the offset value into a register
34663 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34664 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34665 .add(Base)
34666 .add(Scale)
34667 .add(Index)
34668 .addDisp(Disp, UseFPOffset ? 4 : 0)
34669 .add(Segment)
34670 .setMemRefs(LoadOnlyMMO);
34671
34672 // Check if there is enough room left to pull this argument.
34673 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34674 .addReg(OffsetReg)
34675 .addImm(MaxOffset + 8 - ArgSizeA8);
34676
34677 // Branch to "overflowMBB" if offset >= max
34678 // Fall through to "offsetMBB" otherwise
34679 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34680 .addMBB(overflowMBB).addImm(X86::COND_AE);
34681 }
34682
34683 // In offsetMBB, emit code to use the reg_save_area.
34684 if (offsetMBB) {
34685 assert(OffsetReg != 0);
34686
34687 // Read the reg_save_area address.
34688 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34689 BuildMI(
34690 offsetMBB, MIMD,
34691 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34692 RegSaveReg)
34693 .add(Base)
34694 .add(Scale)
34695 .add(Index)
34696 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34697 .add(Segment)
34698 .setMemRefs(LoadOnlyMMO);
34699
34700 if (Subtarget.isTarget64BitLP64()) {
34701 // Zero-extend the offset
34702 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34703 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34704 .addImm(0)
34705 .addReg(OffsetReg)
34706 .addImm(X86::sub_32bit);
34707
34708 // Add the offset to the reg_save_area to get the final address.
34709 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34710 .addReg(OffsetReg64)
34711 .addReg(RegSaveReg);
34712 } else {
34713 // Add the offset to the reg_save_area to get the final address.
34714 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34715 .addReg(OffsetReg)
34716 .addReg(RegSaveReg);
34717 }
34718
34719 // Compute the offset for the next argument
34720 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34721 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34722 .addReg(OffsetReg)
34723 .addImm(UseFPOffset ? 16 : 8);
34724
34725 // Store it back into the va_list.
34726 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34727 .add(Base)
34728 .add(Scale)
34729 .add(Index)
34730 .addDisp(Disp, UseFPOffset ? 4 : 0)
34731 .add(Segment)
34732 .addReg(NextOffsetReg)
34733 .setMemRefs(StoreOnlyMMO);
34734
34735 // Jump to endMBB
34736 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34737 .addMBB(endMBB);
34738 }
34739
34740 //
34741 // Emit code to use overflow area
34742 //
34743
34744 // Load the overflow_area address into a register.
34745 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34746 BuildMI(overflowMBB, MIMD,
34747 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34748 OverflowAddrReg)
34749 .add(Base)
34750 .add(Scale)
34751 .add(Index)
34752 .addDisp(Disp, 8)
34753 .add(Segment)
34754 .setMemRefs(LoadOnlyMMO);
34755
34756 // If we need to align it, do so. Otherwise, just copy the address
34757 // to OverflowDestReg.
34758 if (NeedsAlign) {
34759 // Align the overflow address
34760 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34761
34762 // aligned_addr = (addr + (align-1)) & ~(align-1)
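// (e.g. addr = 0x1006 with align = 16: (0x1006 + 15) & ~15 == 0x1010)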
34763 BuildMI(
34764 overflowMBB, MIMD,
34765 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34766 TmpReg)
34767 .addReg(OverflowAddrReg)
34768 .addImm(Alignment.value() - 1);
34769
34770 BuildMI(
34771 overflowMBB, MIMD,
34772 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34773 OverflowDestReg)
34774 .addReg(TmpReg)
34775 .addImm(~(uint64_t)(Alignment.value() - 1));
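    // Worked example with illustrative values: for a 16-byte aligned argument
    // and an overflow address of 0x1004, (0x1004 + 15) & ~15 == 0x1010, so the
    // argument is read from the next 16-byte boundary.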
34776 } else {
34777 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34778 .addReg(OverflowAddrReg);
34779 }
34780
34781 // Compute the next overflow address after this argument.
34782 // (the overflow address should be kept 8-byte aligned)
34783 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34784 BuildMI(
34785 overflowMBB, MIMD,
34786 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34787 NextAddrReg)
34788 .addReg(OverflowDestReg)
34789 .addImm(ArgSizeA8);
34790
34791 // Store the new overflow address.
34792 BuildMI(overflowMBB, MIMD,
34793 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34794 .add(Base)
34795 .add(Scale)
34796 .add(Index)
34797 .addDisp(Disp, 8)
34798 .add(Segment)
34799 .addReg(NextAddrReg)
34800 .setMemRefs(StoreOnlyMMO);
34801
34802 // If we branched, emit the PHI to the front of endMBB.
34803 if (offsetMBB) {
34804 BuildMI(*endMBB, endMBB->begin(), MIMD,
34805 TII->get(X86::PHI), DestReg)
34806 .addReg(OffsetDestReg).addMBB(offsetMBB)
34807 .addReg(OverflowDestReg).addMBB(overflowMBB);
34808 }
34809
34810 // Erase the pseudo instruction
34811 MI.eraseFromParent();
34812
34813 return endMBB;
34814}
34815
34816// The EFLAGS operand of SelectItr might be missing a kill marker
34817// because there were multiple uses of EFLAGS, and ISel didn't know
34818// which to mark. Figure out whether SelectItr should have had a
34819// kill marker, and set it if it should. Returns the correct kill
34820// marker value.
34821 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34822                                      MachineBasicBlock* BB,
34823                                      const TargetRegisterInfo* TRI) {
34824 if (isEFLAGSLiveAfter(SelectItr, BB))
34825 return false;
34826
34827 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34828 // out. SelectMI should have a kill flag on EFLAGS.
34829 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34830 return true;
34831}
34832
34833// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34834// together with other CMOV pseudo-opcodes into a single basic-block with
34835// conditional jump around it.
34836 static bool isCMOVPseudo(MachineInstr &MI) {
34837   switch (MI.getOpcode()) {
34838 case X86::CMOV_FR16:
34839 case X86::CMOV_FR16X:
34840 case X86::CMOV_FR32:
34841 case X86::CMOV_FR32X:
34842 case X86::CMOV_FR64:
34843 case X86::CMOV_FR64X:
34844 case X86::CMOV_GR8:
34845 case X86::CMOV_GR16:
34846 case X86::CMOV_GR32:
34847 case X86::CMOV_RFP32:
34848 case X86::CMOV_RFP64:
34849 case X86::CMOV_RFP80:
34850 case X86::CMOV_VR64:
34851 case X86::CMOV_VR128:
34852 case X86::CMOV_VR128X:
34853 case X86::CMOV_VR256:
34854 case X86::CMOV_VR256X:
34855 case X86::CMOV_VR512:
34856 case X86::CMOV_VK1:
34857 case X86::CMOV_VK2:
34858 case X86::CMOV_VK4:
34859 case X86::CMOV_VK8:
34860 case X86::CMOV_VK16:
34861 case X86::CMOV_VK32:
34862 case X86::CMOV_VK64:
34863 return true;
34864
34865 default:
34866 return false;
34867 }
34868}
34869
34870// Helper function, which inserts PHI functions into SinkMBB:
34871// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34872// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34873// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34874// the last PHI function inserted.
34875 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34876     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34877     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34878 MachineBasicBlock *SinkMBB) {
34879 MachineFunction *MF = TrueMBB->getParent();
34880   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34881   const MIMetadata MIMD(*MIItBegin);
34882
34883 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34884   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34885
34886 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34887
34888 // As we are creating the PHIs, we have to be careful if there is more than
34889 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34890 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34891 // That also means that PHI construction must work forward from earlier to
34892   // later, and that the code must maintain a mapping from each earlier PHI's
34893   // destination register to the registers that went into that PHI.
34894   DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
34895   MachineInstrBuilder MIB;
34896
34897 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34898 Register DestReg = MIIt->getOperand(0).getReg();
34899 Register Op1Reg = MIIt->getOperand(1).getReg();
34900 Register Op2Reg = MIIt->getOperand(2).getReg();
34901
34902 // If this CMOV we are generating is the opposite condition from
34903 // the jump we generated, then we have to swap the operands for the
34904 // PHI that is going to be generated.
34905 if (MIIt->getOperand(3).getImm() == OppCC)
34906 std::swap(Op1Reg, Op2Reg);
34907
34908 if (RegRewriteTable.contains(Op1Reg))
34909 Op1Reg = RegRewriteTable[Op1Reg].first;
34910
34911 if (RegRewriteTable.contains(Op2Reg))
34912 Op2Reg = RegRewriteTable[Op2Reg].second;
34913
34914 MIB =
34915 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34916 .addReg(Op1Reg)
34917 .addMBB(FalseMBB)
34918 .addReg(Op2Reg)
34919 .addMBB(TrueMBB);
34920
34921 // Add this PHI to the rewrite table.
34922 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34923 }
34924
34925 return MIB;
34926}
34927
34928 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
34929 MachineBasicBlock *
34930 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34931 MachineInstr &SecondCascadedCMOV,
34932 MachineBasicBlock *ThisMBB) const {
34933 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34934 const MIMetadata MIMD(FirstCMOV);
34935
34936 // We lower cascaded CMOVs such as
34937 //
34938 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34939 //
34940 // to two successive branches.
34941 //
34942 // Without this, we would add a PHI between the two jumps, which ends up
34943 // creating a few copies all around. For instance, for
34944 //
34945 // (sitofp (zext (fcmp une)))
34946 //
34947 // we would generate:
34948 //
34949 // ucomiss %xmm1, %xmm0
34950 // movss <1.0f>, %xmm0
34951 // movaps %xmm0, %xmm1
34952 // jne .LBB5_2
34953 // xorps %xmm1, %xmm1
34954 // .LBB5_2:
34955 // jp .LBB5_4
34956 // movaps %xmm1, %xmm0
34957 // .LBB5_4:
34958 // retq
34959 //
34960 // because this custom-inserter would have generated:
34961 //
34962 // A
34963 // | \
34964 // | B
34965 // | /
34966 // C
34967 // | \
34968 // | D
34969 // | /
34970 // E
34971 //
34972 // A: X = ...; Y = ...
34973 // B: empty
34974 // C: Z = PHI [X, A], [Y, B]
34975 // D: empty
34976 // E: PHI [X, C], [Z, D]
34977 //
34978 // If we lower both CMOVs in a single step, we can instead generate:
34979 //
34980 // A
34981 // | \
34982 // | C
34983 // | /|
34984 // |/ |
34985 // | |
34986 // | D
34987 // | /
34988 // E
34989 //
34990 // A: X = ...; Y = ...
34991 // D: empty
34992 // E: PHI [X, A], [X, C], [Y, D]
34993 //
34994 // Which, in our sitofp/fcmp example, gives us something like:
34995 //
34996 // ucomiss %xmm1, %xmm0
34997 // movss <1.0f>, %xmm0
34998 // jne .LBB5_4
34999 // jp .LBB5_4
35000 // xorps %xmm0, %xmm0
35001 // .LBB5_4:
35002 // retq
35003 //
35004
35005 // We lower cascaded CMOV into two successive branches to the same block.
35006 // EFLAGS is used by both, so mark it as live in the second.
35007 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35008 MachineFunction *F = ThisMBB->getParent();
35009 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35010 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35011 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35012
35013 MachineFunction::iterator It = ++ThisMBB->getIterator();
35014 F->insert(It, FirstInsertedMBB);
35015 F->insert(It, SecondInsertedMBB);
35016 F->insert(It, SinkMBB);
35017
35018 // For a cascaded CMOV, we lower it to two successive branches to
35019 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35020 // the FirstInsertedMBB.
35021 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35022
35023 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35024 // live into the sink and copy blocks.
35025 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35026 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35027 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35028 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35029 SinkMBB->addLiveIn(X86::EFLAGS);
35030 }
35031
35032 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35033 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35034 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35035 ThisMBB->end());
35036 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35037
35038 // Fallthrough block for ThisMBB.
35039 ThisMBB->addSuccessor(FirstInsertedMBB);
35040 // The true block target of the first branch is always SinkMBB.
35041 ThisMBB->addSuccessor(SinkMBB);
35042 // Fallthrough block for FirstInsertedMBB.
35043 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35044 // The true block for the branch of FirstInsertedMBB.
35045 FirstInsertedMBB->addSuccessor(SinkMBB);
35046 // This is fallthrough.
35047 SecondInsertedMBB->addSuccessor(SinkMBB);
35048
35049 // Create the conditional branch instructions.
35050 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35051 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35052
35053 X86::CondCode SecondCC =
35054 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35055 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35056 .addMBB(SinkMBB)
35057 .addImm(SecondCC);
35058
35059 // SinkMBB:
35060 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35061 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35062 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35063 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35064   MachineInstrBuilder MIB =
35065       BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35066 .addReg(Op1Reg)
35067 .addMBB(SecondInsertedMBB)
35068 .addReg(Op2Reg)
35069 .addMBB(ThisMBB);
35070
35071   // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
35072   // (the True operand of the SELECT_CC/CMOV nodes).
35073 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35074
35075 // Now remove the CMOVs.
35076 FirstCMOV.eraseFromParent();
35077 SecondCascadedCMOV.eraseFromParent();
35078
35079 return SinkMBB;
35080}
35081
35082 MachineBasicBlock *
35083 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35084 MachineBasicBlock *ThisMBB) const {
35085 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35086 const MIMetadata MIMD(MI);
35087
35088 // To "insert" a SELECT_CC instruction, we actually have to insert the
35089 // diamond control-flow pattern. The incoming instruction knows the
35090 // destination vreg to set, the condition code register to branch on, the
35091 // true/false values to select between and a branch opcode to use.
35092
35093 // ThisMBB:
35094 // ...
35095 // TrueVal = ...
35096 // cmpTY ccX, r1, r2
35097 // bCC copy1MBB
35098 // fallthrough --> FalseMBB
35099
35100 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35101 // as described above, by inserting a BB, and then making a PHI at the join
35102 // point to select the true and false operands of the CMOV in the PHI.
35103 //
35104 // The code also handles two different cases of multiple CMOV opcodes
35105 // in a row.
35106 //
35107 // Case 1:
35108 // In this case, there are multiple CMOVs in a row, all which are based on
35109 // the same condition setting (or the exact opposite condition setting).
35110 // In this case we can lower all the CMOVs using a single inserted BB, and
35111 // then make a number of PHIs at the join point to model the CMOVs. The only
35112 // trickiness here, is that in a case like:
35113 //
35114 // t2 = CMOV cond1 t1, f1
35115 // t3 = CMOV cond1 t2, f2
35116 //
35117 // when rewriting this into PHIs, we have to perform some renaming on the
35118 // temps since you cannot have a PHI operand refer to a PHI result earlier
35119 // in the same block. The "simple" but wrong lowering would be:
35120 //
35121 // t2 = PHI t1(BB1), f1(BB2)
35122 // t3 = PHI t2(BB1), f2(BB2)
35123 //
35124 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35125 // renaming is to note that on the path through BB1, t2 is really just a
35126 // copy of t1, and do that renaming, properly generating:
35127 //
35128 // t2 = PHI t1(BB1), f1(BB2)
35129 // t3 = PHI t1(BB1), f2(BB2)
35130 //
35131 // Case 2:
35132 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35133 // function - EmitLoweredCascadedSelect.
35134
35135 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
35136   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35137   MachineInstr *LastCMOV = &MI;
35138   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
35139
35140 // Check for case 1, where there are multiple CMOVs with the same condition
35141 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
35142 // number of jumps the most.
35143
35144 if (isCMOVPseudo(MI)) {
35145 // See if we have a string of CMOVS with the same condition. Skip over
35146 // intervening debug insts.
35147 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35148 (NextMIIt->getOperand(3).getImm() == CC ||
35149 NextMIIt->getOperand(3).getImm() == OppCC)) {
35150 LastCMOV = &*NextMIIt;
35151 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35152 }
35153 }
35154
35155   // Check for case 2, but only if we didn't already find case 1, as
35156   // indicated by LastCMOV still being &MI.
35157 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35158 NextMIIt->getOpcode() == MI.getOpcode() &&
35159 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35160 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35161 NextMIIt->getOperand(1).isKill()) {
35162 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
35163 }
35164
35165 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35166 MachineFunction *F = ThisMBB->getParent();
35167 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35168 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35169
35170 MachineFunction::iterator It = ++ThisMBB->getIterator();
35171 F->insert(It, FalseMBB);
35172 F->insert(It, SinkMBB);
35173
35174 // Set the call frame size on entry to the new basic blocks.
35175 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
35176 FalseMBB->setCallFrameSize(CallFrameSize);
35177 SinkMBB->setCallFrameSize(CallFrameSize);
35178
35179 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35180 // live into the sink and copy blocks.
35181 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35182 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35183 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
35184 FalseMBB->addLiveIn(X86::EFLAGS);
35185 SinkMBB->addLiveIn(X86::EFLAGS);
35186 }
35187
35188 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35189   auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35190                                    MachineBasicBlock::iterator(LastCMOV));
35191 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35192 if (MI.isDebugInstr())
35193 SinkMBB->push_back(MI.removeFromParent());
35194
35195 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35196 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35197 std::next(MachineBasicBlock::iterator(LastCMOV)),
35198 ThisMBB->end());
35199 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35200
35201 // Fallthrough block for ThisMBB.
35202 ThisMBB->addSuccessor(FalseMBB);
35203   // The true block target of the first (or only) branch is always SinkMBB.
35204 ThisMBB->addSuccessor(SinkMBB);
35205 // Fallthrough block for FalseMBB.
35206 FalseMBB->addSuccessor(SinkMBB);
35207
35208 // Create the conditional branch instruction.
35209 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35210
35211 // SinkMBB:
35212 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35213 // ...
35214   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35215   MachineBasicBlock::iterator MIItEnd =
35216       std::next(MachineBasicBlock::iterator(LastCMOV));
35217 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35218
35219 // Now remove the CMOV(s).
35220 ThisMBB->erase(MIItBegin, MIItEnd);
35221
35222 return SinkMBB;
35223}
35224
35225static unsigned getSUBriOpcode(bool IsLP64) {
35226 if (IsLP64)
35227 return X86::SUB64ri32;
35228 else
35229 return X86::SUB32ri;
35230}
35231
35232 MachineBasicBlock *
35233 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35234 MachineBasicBlock *MBB) const {
35235 MachineFunction *MF = MBB->getParent();
35236 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35237 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35238 const MIMetadata MIMD(MI);
35239 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35240
35241 const unsigned ProbeSize = getStackProbeSize(*MF);
35242
35243   MachineRegisterInfo &MRI = MF->getRegInfo();
35244   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35245 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35246 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35247
35248   MachineFunction::iterator MBBIter = ++MBB->getIterator();
35249   MF->insert(MBBIter, testMBB);
35250 MF->insert(MBBIter, blockMBB);
35251 MF->insert(MBBIter, tailMBB);
35252
35253 Register sizeVReg = MI.getOperand(1).getReg();
35254
35255 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35256
35257 Register TmpStackPtr = MRI.createVirtualRegister(
35258 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35259 Register FinalStackPtr = MRI.createVirtualRegister(
35260 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35261
35262 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35263 .addReg(physSPReg);
35264 {
35265 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35266 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35267 .addReg(TmpStackPtr)
35268 .addReg(sizeVReg);
35269 }
35270
35271 // test rsp size
35272
35273 BuildMI(testMBB, MIMD,
35274 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35275 .addReg(FinalStackPtr)
35276 .addReg(physSPReg);
35277
35278 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35279 .addMBB(tailMBB)
35280       .addImm(X86::COND_GE);
35281   testMBB->addSuccessor(blockMBB);
35282 testMBB->addSuccessor(tailMBB);
35283
35284   // Touch the block, then extend it. This is the opposite order from static
35285   // probing, where we allocate and then touch, and it avoids having to probe
35286   // the tail of the static alloca. Possible scenarios are:
35287 //
35288 // + ---- <- ------------ <- ------------- <- ------------ +
35289 // | |
35290 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35291 // | |
35292 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35293 //
35294 // The property we want to enforce is to never have more than [page alloc] between two probes.
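  // As a rough sketch (pseudocode, not the exact emitted instructions), the
  // blocks built below amount to:
  //
  //   final = rsp - size
  // test:
  //   if (final >= rsp) goto tail      // everything allocated and probed
  //   *(rsp) ^= 0                      // touch the current page
  //   rsp -= ProbeSize
  //   goto test
  // tail:
  //   result = final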
35295
35296 const unsigned XORMIOpc =
35297 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35298 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35299 .addImm(0);
35300
35301 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35302 physSPReg)
35303 .addReg(physSPReg)
35304 .addImm(ProbeSize);
35305
35306 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35307 blockMBB->addSuccessor(testMBB);
35308
35309   // Replace the original instruction's result with the expected stack pointer.
35310 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35311 MI.getOperand(0).getReg())
35312 .addReg(FinalStackPtr);
35313
35314 tailMBB->splice(tailMBB->end(), MBB,
35315 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35316   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35317   MBB->addSuccessor(testMBB);
35318
35319 // Delete the original pseudo instruction.
35320 MI.eraseFromParent();
35321
35322 // And we're done.
35323 return tailMBB;
35324}
35325
35326 MachineBasicBlock *
35327 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35328 MachineBasicBlock *BB) const {
35329 MachineFunction *MF = BB->getParent();
35330 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35331 const MIMetadata MIMD(MI);
35332 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35333
35334 assert(MF->shouldSplitStack());
35335
35336 const bool Is64Bit = Subtarget.is64Bit();
35337 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35338
35339 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35340 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
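  // These constants are assumed to be the per-thread stack-limit slot used by
  // the segmented-stack (split-stack) runtime, addressed via %fs on 64-bit
  // and %gs on 32-bit. The check emitted below then behaves roughly like:
  //
  //   new_sp = sp - size
  //   if (tls[TlsOffset] > new_sp)   // stack limit above the new SP
  //     call __morestack_allocate_stack_space
  //   else
  //     sp = new_sp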
35341
35342 // BB:
35343 // ... [Till the alloca]
35344 // If stacklet is not large enough, jump to mallocMBB
35345 //
35346 // bumpMBB:
35347 // Allocate by subtracting from RSP
35348 // Jump to continueMBB
35349 //
35350 // mallocMBB:
35351 // Allocate by call to runtime
35352 //
35353 // continueMBB:
35354 // ...
35355 // [rest of original BB]
35356 //
35357
35358 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35359 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35360 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35361
35362   MachineRegisterInfo &MRI = MF->getRegInfo();
35363   const TargetRegisterClass *AddrRegClass =
35364       getRegClassFor(getPointerTy(MF->getDataLayout()));
35365
35366 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35367 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35368 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35369 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35370 sizeVReg = MI.getOperand(1).getReg(),
35371 physSPReg =
35372 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35373
35374 MachineFunction::iterator MBBIter = ++BB->getIterator();
35375
35376 MF->insert(MBBIter, bumpMBB);
35377 MF->insert(MBBIter, mallocMBB);
35378 MF->insert(MBBIter, continueMBB);
35379
35380 continueMBB->splice(continueMBB->begin(), BB,
35381 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35382 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35383
35384 // Add code to the main basic block to check if the stack limit has been hit,
35385 // and if so, jump to mallocMBB otherwise to bumpMBB.
35386 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35387 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35388 .addReg(tmpSPVReg).addReg(sizeVReg);
35389 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35390 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35391 .addReg(SPLimitVReg);
35392 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35393
35394 // bumpMBB simply decreases the stack pointer, since we know the current
35395 // stacklet has enough space.
35396 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35397 .addReg(SPLimitVReg);
35398 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35399 .addReg(SPLimitVReg);
35400 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35401
35402 // Calls into a routine in libgcc to allocate more space from the heap.
35403 const uint32_t *RegMask =
35404       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35405   if (IsLP64) {
35406 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35407 .addReg(sizeVReg);
35408 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35409 .addExternalSymbol("__morestack_allocate_stack_space")
35410 .addRegMask(RegMask)
35411 .addReg(X86::RDI, RegState::Implicit)
35412 .addReg(X86::RAX, RegState::ImplicitDefine);
35413 } else if (Is64Bit) {
35414 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35415 .addReg(sizeVReg);
35416 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35417 .addExternalSymbol("__morestack_allocate_stack_space")
35418 .addRegMask(RegMask)
35419 .addReg(X86::EDI, RegState::Implicit)
35420 .addReg(X86::EAX, RegState::ImplicitDefine);
35421 } else {
35422 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35423 .addImm(12);
35424 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35425 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35426 .addExternalSymbol("__morestack_allocate_stack_space")
35427 .addRegMask(RegMask)
35428 .addReg(X86::EAX, RegState::ImplicitDefine);
35429 }
35430
35431 if (!Is64Bit)
35432 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35433 .addImm(16);
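  // Note on the 32-bit sequence above: the SUB of 12 plus the 4-byte PUSH of
  // the size argument keep the outgoing stack 16-byte aligned (assuming it was
  // aligned on entry), and the ADD of 16 afterwards pops both the argument and
  // the padding.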
35434
35435 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35436 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35437 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35438
35439 // Set up the CFG correctly.
35440 BB->addSuccessor(bumpMBB);
35441 BB->addSuccessor(mallocMBB);
35442 mallocMBB->addSuccessor(continueMBB);
35443 bumpMBB->addSuccessor(continueMBB);
35444
35445 // Take care of the PHI nodes.
35446 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35447 MI.getOperand(0).getReg())
35448 .addReg(mallocPtrVReg)
35449 .addMBB(mallocMBB)
35450 .addReg(bumpSPPtrVReg)
35451 .addMBB(bumpMBB);
35452
35453 // Delete the original pseudo instruction.
35454 MI.eraseFromParent();
35455
35456 // And we're done.
35457 return continueMBB;
35458}
35459
35460 MachineBasicBlock *
35461 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35462 MachineBasicBlock *BB) const {
35463 MachineFunction *MF = BB->getParent();
35464 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35465 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35466 const MIMetadata MIMD(MI);
35467
35468   assert(!isAsynchronousEHPersonality(
35469              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
35470          "SEH does not use catchret!");
35471
35472 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35473 if (!Subtarget.is32Bit())
35474 return BB;
35475
35476 // C++ EH creates a new target block to hold the restore code, and wires up
35477 // the new block to the return destination with a normal JMP_4.
35478 MachineBasicBlock *RestoreMBB =
35479       MF->CreateMachineBasicBlock(BB->getBasicBlock());
35480   assert(BB->succ_size() == 1);
35481 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35482 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35483 BB->addSuccessor(RestoreMBB);
35484 MI.getOperand(0).setMBB(RestoreMBB);
35485
35486 // Marking this as an EH pad but not a funclet entry block causes PEI to
35487 // restore stack pointers in the block.
35488 RestoreMBB->setIsEHPad(true);
35489
35490 auto RestoreMBBI = RestoreMBB->begin();
35491 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35492 return BB;
35493}
35494
35496X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35497 MachineBasicBlock *BB) const {
35498 // So, here we replace TLSADDR with the sequence:
35499 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35500   // We need this because TLSADDR is lowered into a call
35501   // inside MC; without the two markers, shrink-wrapping
35502   // may push the prologue/epilogue past them.
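  // In MIR terms the result is roughly the following (shown for the 64-bit
  // pseudo opcodes; the exact operands are filled in below):
  //
  //   ADJCALLSTACKDOWN64 0, 0, 0
  //   TLS_addr64 ...
  //   ADJCALLSTACKUP64 0, 0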
35503 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35504 const MIMetadata MIMD(MI);
35505 MachineFunction &MF = *BB->getParent();
35506
35507 // Emit CALLSEQ_START right before the instruction.
35508 MF.getFrameInfo().setAdjustsStack(true);
35509 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35510 MachineInstrBuilder CallseqStart =
35511 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35512 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35513
35514 // Emit CALLSEQ_END right after the instruction.
35515 // We don't call erase from parent because we want to keep the
35516 // original instruction around.
35517 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35518 MachineInstrBuilder CallseqEnd =
35519 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35520 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35521
35522 return BB;
35523}
35524
35525 MachineBasicBlock *
35526 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35527 MachineBasicBlock *BB) const {
35528 // This is pretty easy. We're taking the value that we received from
35529 // our load from the relocation, sticking it in either RDI (x86-64)
35530 // or EAX and doing an indirect call. The return value will then
35531 // be in the normal return register.
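  // For the 64-bit Darwin case this corresponds to assembly along the lines
  // of (illustrative only; MC picks the actual relocations):
  //
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)
  //   ## address of the TLS variable is now in %rax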
35532 MachineFunction *F = BB->getParent();
35533 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35534 const MIMetadata MIMD(MI);
35535
35536 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35537 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35538
35539 // Get a register mask for the lowered call.
35540 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35541 // proper register mask.
35542 const uint32_t *RegMask =
35543 Subtarget.is64Bit() ?
35544       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35545       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35546   if (Subtarget.is64Bit()) {
35547     MachineInstrBuilder MIB =
35548         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35549 .addReg(X86::RIP)
35550 .addImm(0)
35551 .addReg(0)
35552 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35553 MI.getOperand(3).getTargetFlags())
35554 .addReg(0);
35555 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35556 addDirectMem(MIB, X86::RDI);
35557 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35558 } else if (!isPositionIndependent()) {
35559     MachineInstrBuilder MIB =
35560         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35561 .addReg(0)
35562 .addImm(0)
35563 .addReg(0)
35564 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35565 MI.getOperand(3).getTargetFlags())
35566 .addReg(0);
35567 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35568 addDirectMem(MIB, X86::EAX);
35569 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35570 } else {
35571     MachineInstrBuilder MIB =
35572         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35573 .addReg(TII->getGlobalBaseReg(F))
35574 .addImm(0)
35575 .addReg(0)
35576 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35577 MI.getOperand(3).getTargetFlags())
35578 .addReg(0);
35579 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35580 addDirectMem(MIB, X86::EAX);
35581 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35582 }
35583
35584 MI.eraseFromParent(); // The pseudo instruction is gone now.
35585 return BB;
35586}
35587
35588static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35589 switch (RPOpc) {
35590 case X86::INDIRECT_THUNK_CALL32:
35591 return X86::CALLpcrel32;
35592 case X86::INDIRECT_THUNK_CALL64:
35593 return X86::CALL64pcrel32;
35594 case X86::INDIRECT_THUNK_TCRETURN32:
35595 return X86::TCRETURNdi;
35596 case X86::INDIRECT_THUNK_TCRETURN64:
35597 return X86::TCRETURNdi64;
35598 }
35599 llvm_unreachable("not indirect thunk opcode");
35600}
35601
35602static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35603 unsigned Reg) {
35604 if (Subtarget.useRetpolineExternalThunk()) {
35605 // When using an external thunk for retpolines, we pick names that match the
35606 // names GCC happens to use as well. This helps simplify the implementation
35607 // of the thunks for kernels where they have no easy ability to create
35608 // aliases and are doing non-trivial configuration of the thunk's body. For
35609 // example, the Linux kernel will do boot-time hot patching of the thunk
35610 // bodies and cannot easily export aliases of these to loaded modules.
35611 //
35612 // Note that at any point in the future, we may need to change the semantics
35613 // of how we implement retpolines and at that time will likely change the
35614 // name of the called thunk. Essentially, there is no hard guarantee that
35615 // LLVM will generate calls to specific thunks, we merely make a best-effort
35616 // attempt to help out kernels and other systems where duplicating the
35617 // thunks is costly.
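    // For context, a thunk with one of these names is expected to implement
    // the usual retpoline pattern, e.g. (sketch only; the body is provided by
    // the kernel or emitted by a separate thunk pass, not here):
    //
    //   __x86_indirect_thunk_r11:
    //     call  1f
    //   2:  pause
    //       lfence
    //       jmp   2b
    //   1:  mov   %r11, (%rsp)
    //       ret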
35618 switch (Reg) {
35619 case X86::EAX:
35620 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35621 return "__x86_indirect_thunk_eax";
35622 case X86::ECX:
35623 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35624 return "__x86_indirect_thunk_ecx";
35625 case X86::EDX:
35626 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35627 return "__x86_indirect_thunk_edx";
35628 case X86::EDI:
35629 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35630 return "__x86_indirect_thunk_edi";
35631 case X86::R11:
35632 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35633 return "__x86_indirect_thunk_r11";
35634 }
35635 llvm_unreachable("unexpected reg for external indirect thunk");
35636 }
35637
35638 if (Subtarget.useRetpolineIndirectCalls() ||
35639 Subtarget.useRetpolineIndirectBranches()) {
35640 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35641 switch (Reg) {
35642 case X86::EAX:
35643 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35644 return "__llvm_retpoline_eax";
35645 case X86::ECX:
35646 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35647 return "__llvm_retpoline_ecx";
35648 case X86::EDX:
35649 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35650 return "__llvm_retpoline_edx";
35651 case X86::EDI:
35652 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35653 return "__llvm_retpoline_edi";
35654 case X86::R11:
35655 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35656 return "__llvm_retpoline_r11";
35657 }
35658 llvm_unreachable("unexpected reg for retpoline");
35659 }
35660
35661 if (Subtarget.useLVIControlFlowIntegrity()) {
35662 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35663 return "__llvm_lvi_thunk_r11";
35664 }
35665 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35666}
35667
35668 MachineBasicBlock *
35669 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35670 MachineBasicBlock *BB) const {
35671 // Copy the virtual register into the R11 physical register and
35672 // call the retpoline thunk.
35673 const MIMetadata MIMD(MI);
35674 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35675 Register CalleeVReg = MI.getOperand(0).getReg();
35676 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35677
35678 // Find an available scratch register to hold the callee. On 64-bit, we can
35679 // just use R11, but we scan for uses anyway to ensure we don't generate
35680 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35681 // already a register use operand to the call to hold the callee. If none
35682 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35683 // register and ESI is the base pointer to realigned stack frames with VLAs.
35684 SmallVector<unsigned, 3> AvailableRegs;
35685 if (Subtarget.is64Bit())
35686 AvailableRegs.push_back(X86::R11);
35687 else
35688 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35689
35690 // Zero out any registers that are already used.
35691 for (const auto &MO : MI.operands()) {
35692 if (MO.isReg() && MO.isUse())
35693 for (unsigned &Reg : AvailableRegs)
35694 if (Reg == MO.getReg())
35695 Reg = 0;
35696 }
35697
35698 // Choose the first remaining non-zero available register.
35699 unsigned AvailableReg = 0;
35700 for (unsigned MaybeReg : AvailableRegs) {
35701 if (MaybeReg) {
35702 AvailableReg = MaybeReg;
35703 break;
35704 }
35705 }
35706 if (!AvailableReg)
35707 report_fatal_error("calling convention incompatible with retpoline, no "
35708 "available registers");
35709
35710 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35711
35712 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35713 .addReg(CalleeVReg);
35714 MI.getOperand(0).ChangeToES(Symbol);
35715 MI.setDesc(TII->get(Opc));
35716   MachineInstrBuilder(*BB->getParent(), &MI)
35717       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35718 return BB;
35719}
35720
35721 /// SetJmp implies a future control-flow change when the corresponding LongJmp
35722 /// is called.
35723/// Instead of using the 'return' instruction, the long jump fixes the stack and
35724/// performs an indirect branch. To do so it uses the registers that were stored
35725/// in the jump buffer (when calling SetJmp).
35726/// In case the shadow stack is enabled we need to fix it as well, because some
35727/// return addresses will be skipped.
35728/// The function will save the SSP for future fixing in the function
35729/// emitLongJmpShadowStackFix.
35730/// \sa emitLongJmpShadowStackFix
35731/// \param [in] MI The temporary Machine Instruction for the builtin.
35732/// \param [in] MBB The Machine Basic Block that will be modified.
35733void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35734 MachineBasicBlock *MBB) const {
35735 const MIMetadata MIMD(MI);
35736 MachineFunction *MF = MBB->getParent();
35737 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35738   MachineRegisterInfo &MRI = MF->getRegInfo();
35739   MachineInstrBuilder MIB;
35740
35741 // Memory Reference.
35742 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35743 MI.memoperands_end());
35744
35745 // Initialize a register with zero.
35746 MVT PVT = getPointerTy(MF->getDataLayout());
35747 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35748 Register ZReg = MRI.createVirtualRegister(PtrRC);
35749 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35750 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35751 .addDef(ZReg)
35752 .addReg(ZReg, RegState::Undef)
35753 .addReg(ZReg, RegState::Undef);
35754
35755 // Read the current SSP Register value to the zeroed register.
35756 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35757 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35758 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35759
35760 // Write the SSP register value to offset 3 in input memory buffer.
35761 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35762 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35763 const int64_t SSPOffset = 3 * PVT.getStoreSize();
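  // For reference, the jump buffer slots as used by this lowering and by
  // emitEHSjLjLongJmp/emitLongJmpShadowStackFix (pointer-sized each):
  //   slot 0: frame pointer
  //   slot 1: resume address        (LabelOffset = 1 * store size)
  //   slot 2: stack pointer         (SPOffset    = 2 * store size)
  //   slot 3: shadow stack pointer  (SSPOffset   = 3 * store size)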
35764 const unsigned MemOpndSlot = 1;
35765 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35766 if (i == X86::AddrDisp)
35767 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35768 else
35769 MIB.add(MI.getOperand(MemOpndSlot + i));
35770 }
35771 MIB.addReg(SSPCopyReg);
35772 MIB.setMemRefs(MMOs);
35773}
35774
35775 MachineBasicBlock *
35776 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35777 MachineBasicBlock *MBB) const {
35778 const MIMetadata MIMD(MI);
35779 MachineFunction *MF = MBB->getParent();
35780 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35781 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35782   MachineRegisterInfo &MRI = MF->getRegInfo();
35783
35784 const BasicBlock *BB = MBB->getBasicBlock();
35785   MachineFunction::iterator I = ++MBB->getIterator();
35786
35787 // Memory Reference
35788 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35789 MI.memoperands_end());
35790
35791 unsigned DstReg;
35792 unsigned MemOpndSlot = 0;
35793
35794 unsigned CurOp = 0;
35795
35796 DstReg = MI.getOperand(CurOp++).getReg();
35797 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35798 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35799 (void)TRI;
35800 Register mainDstReg = MRI.createVirtualRegister(RC);
35801 Register restoreDstReg = MRI.createVirtualRegister(RC);
35802
35803 MemOpndSlot = CurOp;
35804
35805 MVT PVT = getPointerTy(MF->getDataLayout());
35806 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35807 "Invalid Pointer Size!");
35808
35809 // For v = setjmp(buf), we generate
35810 //
35811 // thisMBB:
35812 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35813 // SjLjSetup restoreMBB
35814 //
35815 // mainMBB:
35816 // v_main = 0
35817 //
35818 // sinkMBB:
35819 // v = phi(main, restore)
35820 //
35821 // restoreMBB:
35822 // if base pointer being used, load it from frame
35823 // v_restore = 1
35824
35825 MachineBasicBlock *thisMBB = MBB;
35826 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35827 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35828 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35829 MF->insert(I, mainMBB);
35830 MF->insert(I, sinkMBB);
35831 MF->push_back(restoreMBB);
35832 restoreMBB->setMachineBlockAddressTaken();
35833
35834   MachineInstrBuilder MIB;
35835
35836 // Transfer the remainder of BB and its successor edges to sinkMBB.
35837 sinkMBB->splice(sinkMBB->begin(), MBB,
35838 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35839   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35840
35841 // thisMBB:
35842 unsigned PtrStoreOpc = 0;
35843 unsigned LabelReg = 0;
35844 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35845 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35846                      !isPositionIndependent();
35847
35848 // Prepare IP either in reg or imm.
35849 if (!UseImmLabel) {
35850 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35851 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35852 LabelReg = MRI.createVirtualRegister(PtrRC);
35853 if (Subtarget.is64Bit()) {
35854 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35855 .addReg(X86::RIP)
35856 .addImm(0)
35857 .addReg(0)
35858 .addMBB(restoreMBB)
35859 .addReg(0);
35860 } else {
35861 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35862 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35863 .addReg(XII->getGlobalBaseReg(MF))
35864 .addImm(0)
35865 .addReg(0)
35866 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35867 .addReg(0);
35868 }
35869 } else
35870 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35871 // Store IP
35872 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35873 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35874 if (i == X86::AddrDisp)
35875 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35876 else
35877 MIB.add(MI.getOperand(MemOpndSlot + i));
35878 }
35879 if (!UseImmLabel)
35880 MIB.addReg(LabelReg);
35881 else
35882 MIB.addMBB(restoreMBB);
35883 MIB.setMemRefs(MMOs);
35884
35885 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35886 emitSetJmpShadowStackFix(MI, thisMBB);
35887 }
35888
35889 // Setup
35890 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35891 .addMBB(restoreMBB);
35892
35893 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35894 MIB.addRegMask(RegInfo->getNoPreservedMask());
35895 thisMBB->addSuccessor(mainMBB);
35896 thisMBB->addSuccessor(restoreMBB);
35897
35898 // mainMBB:
35899 // EAX = 0
35900 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35901 mainMBB->addSuccessor(sinkMBB);
35902
35903 // sinkMBB:
35904 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35905 .addReg(mainDstReg)
35906 .addMBB(mainMBB)
35907 .addReg(restoreDstReg)
35908 .addMBB(restoreMBB);
35909
35910 // restoreMBB:
35911 if (RegInfo->hasBasePointer(*MF)) {
35912 const bool Uses64BitFramePtr =
35913 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35914     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35915     X86FI->setRestoreBasePointer(MF);
35916 Register FramePtr = RegInfo->getFrameRegister(*MF);
35917 Register BasePtr = RegInfo->getBaseRegister();
35918 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35919 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35920 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35921         .setMIFlag(MachineInstr::FrameSetup);
35922   }
35923 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35924 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35925 restoreMBB->addSuccessor(sinkMBB);
35926
35927 MI.eraseFromParent();
35928 return sinkMBB;
35929}
35930
35931 /// Fix the shadow stack using the previously saved SSP value.
35932/// \sa emitSetJmpShadowStackFix
35933/// \param [in] MI The temporary Machine Instruction for the builtin.
35934/// \param [in] MBB The Machine Basic Block that will be modified.
35935/// \return The sink MBB that will perform the future indirect branch.
35936 MachineBasicBlock *
35937 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35938 MachineBasicBlock *MBB) const {
35939 const MIMetadata MIMD(MI);
35940 MachineFunction *MF = MBB->getParent();
35941 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35942   MachineRegisterInfo &MRI = MF->getRegInfo();
35943
35944 // Memory Reference
35945 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35946 MI.memoperands_end());
35947
35948 MVT PVT = getPointerTy(MF->getDataLayout());
35949 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35950
35951 // checkSspMBB:
35952 // xor vreg1, vreg1
35953 // rdssp vreg1
35954 // test vreg1, vreg1
35955 // je sinkMBB # Jump if Shadow Stack is not supported
35956 // fallMBB:
35957 // mov buf+24/12(%rip), vreg2
35958 // sub vreg1, vreg2
35959 // jbe sinkMBB # No need to fix the Shadow Stack
35960 // fixShadowMBB:
35961 // shr 3/2, vreg2
35962 // incssp vreg2 # fix the SSP according to the lower 8 bits
35963 // shr 8, vreg2
35964 // je sinkMBB
35965 // fixShadowLoopPrepareMBB:
35966 // shl vreg2
35967 // mov 128, vreg3
35968 // fixShadowLoopMBB:
35969 // incssp vreg3
35970 // dec vreg2
35971 // jne fixShadowLoopMBB # Iterate until you finish fixing
35972 // # the Shadow Stack
35973 // sinkMBB:
35974
35975   MachineFunction::iterator I = ++MBB->getIterator();
35976   const BasicBlock *BB = MBB->getBasicBlock();
35977
35978 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35979 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35980 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35981 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35982 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35983 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35984 MF->insert(I, checkSspMBB);
35985 MF->insert(I, fallMBB);
35986 MF->insert(I, fixShadowMBB);
35987 MF->insert(I, fixShadowLoopPrepareMBB);
35988 MF->insert(I, fixShadowLoopMBB);
35989 MF->insert(I, sinkMBB);
35990
35991 // Transfer the remainder of BB and its successor edges to sinkMBB.
35992 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35993 MBB->end());
35994   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35995
35996 MBB->addSuccessor(checkSspMBB);
35997
35998 // Initialize a register with zero.
35999 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36000 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36001
36002 if (PVT == MVT::i64) {
36003 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36004 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36005 .addImm(0)
36006 .addReg(ZReg)
36007 .addImm(X86::sub_32bit);
36008 ZReg = TmpZReg;
36009 }
36010
36011 // Read the current SSP Register value to the zeroed register.
36012 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36013 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36014 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36015
36016   // Check whether the value read from the SSP register is zero and jump
36017   // directly to the sink.
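  // (On hardware or kernels where CET shadow stacks are not enabled, RDSSP
  // executes as a no-op and leaves the zeroed register untouched, so a zero
  // result means "feature unavailable".)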
36018 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36019 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36020 .addReg(SSPCopyReg)
36021 .addReg(SSPCopyReg);
36022 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36023 .addMBB(sinkMBB)
36024       .addImm(X86::COND_E);
36025   checkSspMBB->addSuccessor(sinkMBB);
36026 checkSspMBB->addSuccessor(fallMBB);
36027
36028 // Reload the previously saved SSP register value.
36029 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36030 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36031 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36032   MachineInstrBuilder MIB =
36033       BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36034 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36035 const MachineOperand &MO = MI.getOperand(i);
36036 if (i == X86::AddrDisp)
36037 MIB.addDisp(MO, SPPOffset);
36038 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36039 // preserve kill flags.
36040 MIB.addReg(MO.getReg());
36041 else
36042 MIB.add(MO);
36043 }
36044 MIB.setMemRefs(MMOs);
36045
36046 // Subtract the current SSP from the previous SSP.
36047 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36048 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36049 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36050 .addReg(PrevSSPReg)
36051 .addReg(SSPCopyReg);
36052
36053 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36054 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36055 .addMBB(sinkMBB)
36056       .addImm(X86::COND_BE);
36057   fallMBB->addSuccessor(sinkMBB);
36058 fallMBB->addSuccessor(fixShadowMBB);
36059
36060 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36061 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36062 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36063 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36064 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36065 .addReg(SspSubReg)
36066 .addImm(Offset);
36067
36068   // Increase the SSP using only the lower 8 bits of the delta.
36069 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36070 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36071
36072 // Reset the lower 8 bits.
36073 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36074 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36075 .addReg(SspFirstShrReg)
36076 .addImm(8);
36077
36078 // Jump if the result of the shift is zero.
36079 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36080 .addMBB(sinkMBB)
36081       .addImm(X86::COND_E);
36082   fixShadowMBB->addSuccessor(sinkMBB);
36083 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36084
36085 // Do a single shift left.
36086 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
36087 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36088 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36089 .addReg(SspSecondShrReg)
36090 .addImm(1);
36091
36092 // Save the value 128 to a register (will be used next with incssp).
36093 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36094 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36095 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36096 .addImm(128);
36097 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36098
36099 // Since incssp only looks at the lower 8 bits, we might need to do several
36100 // iterations of incssp until we finish fixing the shadow stack.
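  // Worked example of the arithmetic (illustrative): if the SSPs differ by N
  // entries (N = delta >> 3 on LP64), the single INCSSP above already popped
  // N & 0xff of them, and the loop below runs (N >> 8) * 2 iterations of
  // INCSSP 128, popping the remaining (N >> 8) * 256; together that is
  // exactly N.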
36101 Register DecReg = MRI.createVirtualRegister(PtrRC);
36102 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36103 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36104 .addReg(SspAfterShlReg)
36105 .addMBB(fixShadowLoopPrepareMBB)
36106 .addReg(DecReg)
36107 .addMBB(fixShadowLoopMBB);
36108
36109 // Every iteration we increase the SSP by 128.
36110 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36111
36112 // Every iteration we decrement the counter by 1.
36113 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36114 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36115
36116 // Jump if the counter is not zero yet.
36117 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36118 .addMBB(fixShadowLoopMBB)
36119       .addImm(X86::COND_NE);
36120   fixShadowLoopMBB->addSuccessor(sinkMBB);
36121 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36122
36123 return sinkMBB;
36124}
36125
36126 MachineBasicBlock *
36127 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36128 MachineBasicBlock *MBB) const {
36129 const MIMetadata MIMD(MI);
36130 MachineFunction *MF = MBB->getParent();
36131 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36132   MachineRegisterInfo &MRI = MF->getRegInfo();
36133
36134 // Memory Reference
36135 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36136 MI.memoperands_end());
36137
36138 MVT PVT = getPointerTy(MF->getDataLayout());
36139 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36140 "Invalid Pointer Size!");
36141
36142 const TargetRegisterClass *RC =
36143 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36144 Register Tmp = MRI.createVirtualRegister(RC);
36145 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36146 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36147 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36148 Register SP = RegInfo->getStackRegister();
36149
36150   MachineInstrBuilder MIB;
36151
36152 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36153 const int64_t SPOffset = 2 * PVT.getStoreSize();
36154
36155 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36156 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36157
36158 MachineBasicBlock *thisMBB = MBB;
36159
36160   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
36161 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36162 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36163 }
36164
36165 // Reload FP
36166 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36167 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36168 const MachineOperand &MO = MI.getOperand(i);
36169 if (MO.isReg()) // Don't add the whole operand, we don't want to
36170 // preserve kill flags.
36171 MIB.addReg(MO.getReg());
36172 else
36173 MIB.add(MO);
36174 }
36175 MIB.setMemRefs(MMOs);
36176
36177 // Reload IP
36178 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
36179 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36180 const MachineOperand &MO = MI.getOperand(i);
36181 if (i == X86::AddrDisp)
36182 MIB.addDisp(MO, LabelOffset);
36183 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36184 // preserve kill flags.
36185 MIB.addReg(MO.getReg());
36186 else
36187 MIB.add(MO);
36188 }
36189 MIB.setMemRefs(MMOs);
36190
36191 // Reload SP
36192 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36193 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36194 if (i == X86::AddrDisp)
36195 MIB.addDisp(MI.getOperand(i), SPOffset);
36196 else
36197 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36198 // the last instruction of the expansion.
36199 }
36200 MIB.setMemRefs(MMOs);
36201
36202 // Jump
36203 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36204
36205 MI.eraseFromParent();
36206 return thisMBB;
36207}
36208
36209void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36210                                                MachineBasicBlock *MBB,
36211                                                MachineBasicBlock *DispatchBB,
36212 int FI) const {
36213 const MIMetadata MIMD(MI);
36214 MachineFunction *MF = MBB->getParent();
36215   MachineRegisterInfo *MRI = &MF->getRegInfo();
36216   const X86InstrInfo *TII = Subtarget.getInstrInfo();
36217
36218 MVT PVT = getPointerTy(MF->getDataLayout());
36219 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36220
36221 unsigned Op = 0;
36222 unsigned VR = 0;
36223
36224 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36225                      !isPositionIndependent();
36226
36227 if (UseImmLabel) {
36228 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36229 } else {
36230 const TargetRegisterClass *TRC =
36231 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36232 VR = MRI->createVirtualRegister(TRC);
36233 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36234
36235 if (Subtarget.is64Bit())
36236 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36237 .addReg(X86::RIP)
36238 .addImm(1)
36239 .addReg(0)
36240 .addMBB(DispatchBB)
36241 .addReg(0);
36242 else
36243 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36244 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36245 .addImm(1)
36246 .addReg(0)
36247 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36248 .addReg(0);
36249 }
36250
36251 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36252 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36253 if (UseImmLabel)
36254 MIB.addMBB(DispatchBB);
36255 else
36256 MIB.addReg(VR);
36257}
36258
36259 MachineBasicBlock *
36260 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36261 MachineBasicBlock *BB) const {
36262 const MIMetadata MIMD(MI);
36263 MachineFunction *MF = BB->getParent();
36264   MachineRegisterInfo *MRI = &MF->getRegInfo();
36265   const X86InstrInfo *TII = Subtarget.getInstrInfo();
36266 int FI = MF->getFrameInfo().getFunctionContextIndex();
36267
36268 // Get a mapping of the call site numbers to all of the landing pads they're
36269 // associated with.
36270   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36271   unsigned MaxCSNum = 0;
36272 for (auto &MBB : *MF) {
36273 if (!MBB.isEHPad())
36274 continue;
36275
36276 MCSymbol *Sym = nullptr;
36277 for (const auto &MI : MBB) {
36278 if (MI.isDebugInstr())
36279 continue;
36280
36281 assert(MI.isEHLabel() && "expected EH_LABEL");
36282 Sym = MI.getOperand(0).getMCSymbol();
36283 break;
36284 }
36285
36286 if (!MF->hasCallSiteLandingPad(Sym))
36287 continue;
36288
36289 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36290 CallSiteNumToLPad[CSI].push_back(&MBB);
36291 MaxCSNum = std::max(MaxCSNum, CSI);
36292 }
36293 }
36294
36295 // Get an ordered list of the machine basic blocks for the jump table.
36296 std::vector<MachineBasicBlock *> LPadList;
36297   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
36298   LPadList.reserve(CallSiteNumToLPad.size());
36299
36300 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36301 for (auto &LP : CallSiteNumToLPad[CSI]) {
36302 LPadList.push_back(LP);
36303 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36304 }
36305 }
36306
36307 assert(!LPadList.empty() &&
36308 "No landing pad destinations for the dispatch jump table!");
36309
36310 // Create the MBBs for the dispatch code.
36311
36312 // Shove the dispatch's address into the return slot in the function context.
36313 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36314 DispatchBB->setIsEHPad(true);
36315
36316 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36317 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36318 DispatchBB->addSuccessor(TrapBB);
36319
36320 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36321 DispatchBB->addSuccessor(DispContBB);
36322
36323 // Insert MBBs.
36324 MF->push_back(DispatchBB);
36325 MF->push_back(DispContBB);
36326 MF->push_back(TrapBB);
36327
36328 // Insert code into the entry block that creates and registers the function
36329 // context.
36330 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36331
36332 // Create the jump table and associated information
36333 unsigned JTE = getJumpTableEncoding();
36334 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36335 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36336
36337 const X86RegisterInfo &RI = TII->getRegisterInfo();
36338 // Add a register mask with no preserved registers. This results in all
36339 // registers being marked as clobbered.
36340 if (RI.hasBasePointer(*MF)) {
36341 const bool FPIs64Bit =
36342 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36343 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36344 MFI->setRestoreBasePointer(MF);
36345
36346 Register FP = RI.getFrameRegister(*MF);
36347 Register BP = RI.getBaseRegister();
36348 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36349 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36350                 MFI->getRestoreBasePointerOffset())
36351        .addRegMask(RI.getNoPreservedMask());
36352  } else {
36353 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36354        .addRegMask(RI.getNoPreservedMask());
36355  }
36356
36357 // IReg is used as an index in a memory operand and therefore can't be SP
36358 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36359 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36360 Subtarget.is64Bit() ? 8 : 4);
36361 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36362 .addReg(IReg)
36363 .addImm(LPadList.size());
36364 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36365 .addMBB(TrapBB)
36366      .addImm(X86::COND_AE);
36367
36368 if (Subtarget.is64Bit()) {
36369 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36370 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36371
36372 // leaq .LJTI0_0(%rip), BReg
36373 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36374 .addReg(X86::RIP)
36375 .addImm(1)
36376 .addReg(0)
36377 .addJumpTableIndex(MJTI)
36378 .addReg(0);
36379 // movzx IReg64, IReg
36380 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36381 .addImm(0)
36382 .addReg(IReg)
36383 .addImm(X86::sub_32bit);
36384
36385 switch (JTE) {
36386    case MachineJumpTableInfo::EK_BlockAddress:
36387      // jmpq *(BReg,IReg64,8)
36388 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36389 .addReg(BReg)
36390 .addImm(8)
36391 .addReg(IReg64)
36392 .addImm(0)
36393 .addReg(0);
36394 break;
36395    case MachineJumpTableInfo::EK_LabelDifference32: {
36396      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36397 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36398 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36399
36400 // movl (BReg,IReg64,4), OReg
36401 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36402 .addReg(BReg)
36403 .addImm(4)
36404 .addReg(IReg64)
36405 .addImm(0)
36406 .addReg(0);
36407 // movsx OReg64, OReg
36408 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36409 .addReg(OReg);
36410 // addq BReg, OReg64, TReg
36411 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36412 .addReg(OReg64)
36413 .addReg(BReg);
36414 // jmpq *TReg
36415 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36416 break;
36417 }
36418 default:
36419 llvm_unreachable("Unexpected jump table encoding");
36420 }
36421 } else {
36422 // jmpl *.LJTI0_0(,IReg,4)
36423 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36424 .addReg(0)
36425 .addImm(4)
36426 .addReg(IReg)
36427 .addJumpTableIndex(MJTI)
36428 .addReg(0);
36429 }
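  // A rough sketch of the dispatch code built above, for 64-bit targets with
  // the EK_BlockAddress jump-table encoding (virtual registers shown with
  // placeholder names):
  //   movl  8(%fctx), %ireg          ; call-site index stored by the runtime
  //   cmpl  $NumLPads, %ireg
  //   jae   .Ltrap                   ; out-of-range index traps
  //   leaq  .LJTI0_0(%rip), %breg
  //   jmpq  *(%breg,%ireg64,8)       ; tail into the selected landing pad
  // The 32-bit path instead ends with "jmpl *.LJTI0_0(,%ireg,4)".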
36430
36431 // Add the jump table entries as successors to the MBB.
36432  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36433  for (auto &LP : LPadList)
36434 if (SeenMBBs.insert(LP).second)
36435 DispContBB->addSuccessor(LP);
36436
36437 // N.B. the order the invoke BBs are processed in doesn't matter here.
36438  SmallVector<MachineBasicBlock *, 64> MBBLPads;
36439  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36440 for (MachineBasicBlock *MBB : InvokeBBs) {
36441 // Remove the landing pad successor from the invoke block and replace it
36442 // with the new dispatch block.
36443 // Keep a copy of Successors since it's modified inside the loop.
36444    SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
36445                                                   MBB->succ_rend());
36446 // FIXME: Avoid quadratic complexity.
36447 for (auto *MBBS : Successors) {
36448 if (MBBS->isEHPad()) {
36449 MBB->removeSuccessor(MBBS);
36450 MBBLPads.push_back(MBBS);
36451 }
36452 }
36453
36454 MBB->addSuccessor(DispatchBB);
36455
36456 // Find the invoke call and mark all of the callee-saved registers as
36457 // 'implicit defined' so that they're spilled. This prevents code from
36458 // moving instructions to before the EH block, where they will never be
36459 // executed.
36460 for (auto &II : reverse(*MBB)) {
36461 if (!II.isCall())
36462 continue;
36463
36464      DenseMap<unsigned, bool> DefRegs;
36465      for (auto &MOp : II.operands())
36466 if (MOp.isReg())
36467 DefRegs[MOp.getReg()] = true;
36468
36469 MachineInstrBuilder MIB(*MF, &II);
36470 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36471 unsigned Reg = SavedRegs[RegIdx];
36472 if (!DefRegs[Reg])
36473          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36474      }
36475
36476 break;
36477 }
36478 }
36479
36480 // Mark all former landing pads as non-landing pads. The dispatch is the only
36481 // landing pad now.
36482 for (auto &LP : MBBLPads)
36483 LP->setIsEHPad(false);
36484
36485 // The instruction is gone now.
36486 MI.eraseFromParent();
36487 return BB;
36488}
36489
36490MachineBasicBlock *
36491X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
36492 MachineBasicBlock *BB) const {
36493 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
36494 // calls may require proper stack alignment.
36495 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36496 const MIMetadata MIMD(MI);
36497 MachineFunction &MF = *BB->getParent();
36498
36499 // Emit CALLSEQ_START right before the instruction.
36500 MF.getFrameInfo().setAdjustsStack(true);
36501 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36502 MachineInstrBuilder CallseqStart =
36503 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36504 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36505
36506 // Emit CALLSEQ_END right after the instruction.
36507 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36508 MachineInstrBuilder CallseqEnd =
36509 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
36510 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36511
36512 return BB;
36513}
36514
36515MachineBasicBlock *
36516X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36517                                               MachineBasicBlock *BB) const {
36518 MachineFunction *MF = BB->getParent();
36519 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36520 const MIMetadata MIMD(MI);
36521
36522 auto TMMImmToTMMReg = [](unsigned Imm) {
36523 assert (Imm < 8 && "Illegal tmm index");
36524 return X86::TMM0 + Imm;
36525 };
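  // For example, an immediate operand of 3 selects the physical tile register
  // X86::TMM3 (X86::TMM0 + 3); the AMX pseudos handled below carry tile
  // numbers as immediates and are rewritten here to use real TMM registers.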
36526 switch (MI.getOpcode()) {
36527 default: llvm_unreachable("Unexpected instr type to insert");
36528 case X86::TLS_addr32:
36529 case X86::TLS_addr64:
36530 case X86::TLS_addrX32:
36531 case X86::TLS_base_addr32:
36532 case X86::TLS_base_addr64:
36533 case X86::TLS_base_addrX32:
36534 case X86::TLS_desc32:
36535 case X86::TLS_desc64:
36536 return EmitLoweredTLSAddr(MI, BB);
36537 case X86::INDIRECT_THUNK_CALL32:
36538 case X86::INDIRECT_THUNK_CALL64:
36539 case X86::INDIRECT_THUNK_TCRETURN32:
36540 case X86::INDIRECT_THUNK_TCRETURN64:
36541 return EmitLoweredIndirectThunk(MI, BB);
36542 case X86::CATCHRET:
36543 return EmitLoweredCatchRet(MI, BB);
36544 case X86::SEG_ALLOCA_32:
36545 case X86::SEG_ALLOCA_64:
36546 return EmitLoweredSegAlloca(MI, BB);
36547 case X86::PROBED_ALLOCA_32:
36548 case X86::PROBED_ALLOCA_64:
36549 return EmitLoweredProbedAlloca(MI, BB);
36550 case X86::TLSCall_32:
36551 case X86::TLSCall_64:
36552 return EmitLoweredTLSCall(MI, BB);
36553 case X86::CMOV_FR16:
36554 case X86::CMOV_FR16X:
36555 case X86::CMOV_FR32:
36556 case X86::CMOV_FR32X:
36557 case X86::CMOV_FR64:
36558 case X86::CMOV_FR64X:
36559 case X86::CMOV_GR8:
36560 case X86::CMOV_GR16:
36561 case X86::CMOV_GR32:
36562 case X86::CMOV_RFP32:
36563 case X86::CMOV_RFP64:
36564 case X86::CMOV_RFP80:
36565 case X86::CMOV_VR64:
36566 case X86::CMOV_VR128:
36567 case X86::CMOV_VR128X:
36568 case X86::CMOV_VR256:
36569 case X86::CMOV_VR256X:
36570 case X86::CMOV_VR512:
36571 case X86::CMOV_VK1:
36572 case X86::CMOV_VK2:
36573 case X86::CMOV_VK4:
36574 case X86::CMOV_VK8:
36575 case X86::CMOV_VK16:
36576 case X86::CMOV_VK32:
36577 case X86::CMOV_VK64:
36578 return EmitLoweredSelect(MI, BB);
36579
36580 case X86::FP80_ADDr:
36581 case X86::FP80_ADDm32: {
36582 // Change the floating point control register to use double extended
36583 // precision when performing the addition.
36584 int OrigCWFrameIdx =
36585 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36586 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36587 OrigCWFrameIdx);
36588
36589 // Load the old value of the control word...
36590 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36591 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36592 OrigCWFrameIdx);
36593
36594    // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
36595 // precision.
36596 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36597 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36598 .addReg(OldCW, RegState::Kill)
36599 .addImm(0x300);
36600
36601 // Extract to 16 bits.
36602 Register NewCW16 =
36603 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36604 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36605 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36606
36607 // Prepare memory for FLDCW.
36608 int NewCWFrameIdx =
36609 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36610 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36611 NewCWFrameIdx)
36612 .addReg(NewCW16, RegState::Kill);
36613
36614 // Reload the modified control word now...
36615 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36616 NewCWFrameIdx);
36617
36618 // Do the addition.
36619 if (MI.getOpcode() == X86::FP80_ADDr) {
36620 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36621 .add(MI.getOperand(0))
36622 .add(MI.getOperand(1))
36623 .add(MI.getOperand(2));
36624 } else {
36625 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36626 .add(MI.getOperand(0))
36627 .add(MI.getOperand(1))
36628 .add(MI.getOperand(2))
36629 .add(MI.getOperand(3))
36630 .add(MI.getOperand(4))
36631 .add(MI.getOperand(5))
36632 .add(MI.getOperand(6));
36633 }
36634
36635 // Reload the original control word now.
36636 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36637 OrigCWFrameIdx);
36638
36639 MI.eraseFromParent(); // The pseudo instruction is gone now.
36640 return BB;
36641 }
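  // The FP80_ADD case above and the FP*_TO_INT*_IN_MEM cases below both tweak
  // the x87 control word, whose relevant fields are (standard x87 layout):
  //   bits 8-9   precision control  (0b11 = 64-bit significand, i.e. fp80)
  //   bits 10-11 rounding control   (0b11 = round toward zero / truncate)
  // which is why the code ORs in 0x300 above and 0xC00 below, and restores
  // the original control word afterwards.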
36642
36643 case X86::FP32_TO_INT16_IN_MEM:
36644 case X86::FP32_TO_INT32_IN_MEM:
36645 case X86::FP32_TO_INT64_IN_MEM:
36646 case X86::FP64_TO_INT16_IN_MEM:
36647 case X86::FP64_TO_INT32_IN_MEM:
36648 case X86::FP64_TO_INT64_IN_MEM:
36649 case X86::FP80_TO_INT16_IN_MEM:
36650 case X86::FP80_TO_INT32_IN_MEM:
36651 case X86::FP80_TO_INT64_IN_MEM: {
36652 // Change the floating point control register to use "round towards zero"
36653 // mode when truncating to an integer value.
36654 int OrigCWFrameIdx =
36655 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36656 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36657 OrigCWFrameIdx);
36658
36659 // Load the old value of the control word...
36660 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36661 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36662 OrigCWFrameIdx);
36663
36664    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
36665 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36666 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36667 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36668
36669 // Extract to 16 bits.
36670 Register NewCW16 =
36671 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36672 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36673 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36674
36675 // Prepare memory for FLDCW.
36676 int NewCWFrameIdx =
36677 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36678 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36679 NewCWFrameIdx)
36680 .addReg(NewCW16, RegState::Kill);
36681
36682 // Reload the modified control word now...
36683 addFrameReference(BuildMI(*BB, MI, MIMD,
36684 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36685
36686 // Get the X86 opcode to use.
36687 unsigned Opc;
36688 switch (MI.getOpcode()) {
36689 // clang-format off
36690 default: llvm_unreachable("illegal opcode!");
36691 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36692 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36693 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36694 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36695 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36696 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36697 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36698 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36699 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36700 // clang-format on
36701 }
36702
36703    X86AddressMode AM = getAddressFromInstr(&MI, 0);
36704    addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36705 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36706
36707 // Reload the original control word now.
36708 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36709 OrigCWFrameIdx);
36710
36711 MI.eraseFromParent(); // The pseudo instruction is gone now.
36712 return BB;
36713 }
36714
36715 // xbegin
36716 case X86::XBEGIN:
36717 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36718
36719 case X86::VAARG_64:
36720 case X86::VAARG_X32:
36721 return EmitVAARGWithCustomInserter(MI, BB);
36722
36723 case X86::EH_SjLj_SetJmp32:
36724 case X86::EH_SjLj_SetJmp64:
36725 return emitEHSjLjSetJmp(MI, BB);
36726
36727 case X86::EH_SjLj_LongJmp32:
36728 case X86::EH_SjLj_LongJmp64:
36729 return emitEHSjLjLongJmp(MI, BB);
36730
36731 case X86::Int_eh_sjlj_setup_dispatch:
36732 return EmitSjLjDispatchBlock(MI, BB);
36733
36734 case TargetOpcode::STATEPOINT:
36735 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36736 // this point in the process. We diverge later.
36737 return emitPatchPoint(MI, BB);
36738
36739 case TargetOpcode::STACKMAP:
36740 case TargetOpcode::PATCHPOINT:
36741 return emitPatchPoint(MI, BB);
36742
36743 case TargetOpcode::PATCHABLE_EVENT_CALL:
36744 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36745 return emitPatchableEventCall(MI, BB);
36746
36747 case X86::LCMPXCHG8B: {
36748 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36749    // In addition to the four E[ABCD] registers implied by its encoding,
36750    // CMPXCHG8B requires a memory operand. If the target is i686 and the
36751    // current function needs a base pointer - which is ESI on i686 - the
36752    // register allocator would not be able to allocate registers for an
36753    // address of the form X(%reg, %reg, Y): there would never be enough
36754    // unreserved registers during regalloc (without the base pointer the
36755    // only option would be X(%edi, %esi, Y)).
36756    // We give the register allocator a hand by precomputing the address in
36757    // a new vreg using LEA.
36758
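    // Illustrative sketch of that rewrite (register names are placeholders):
    //   before: lock cmpxchg8b X(%esi,%edi,Y)   ; needs two free GPRs besides
    //                                           ; the implied E[ABCD] registers
    //   after:  leal X(%esi,%edi,Y), %vreg
    //           lock cmpxchg8b (%vreg)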
36759 // If it is not i686 or there is no base pointer - nothing to do here.
36760 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36761 return BB;
36762
36763    // Even though this code does not strictly need the base pointer to be
36764    // ESI, we check for that. The reason: if this assert fails, something
36765    // has changed in the compiler's base pointer handling, and that change
36766    // most probably has to be addressed here as well.
36767 assert(TRI->getBaseRegister() == X86::ESI &&
36768 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36769 "base pointer in mind");
36770
36771    MachineRegisterInfo &MRI = MF->getRegInfo();
36772    MVT SPTy = getPointerTy(MF->getDataLayout());
36773 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36774 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36775
36776    X86AddressMode AM = getAddressFromInstr(&MI, 0);
36777    // Regalloc does not need any help when the memory operand of CMPXCHG8B
36778    // does not use an index register.
36779 if (AM.IndexReg == X86::NoRegister)
36780 return BB;
36781
36782 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36783 // four operand definitions that are E[ABCD] registers. We skip them and
36784 // then insert the LEA.
36785 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36786 while (RMBBI != BB->rend() &&
36787 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36788 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36789 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36790 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36791 ++RMBBI;
36792 }
36793    MachineBasicBlock::iterator MBBI(RMBBI);
36794    addFullAddress(
36795        BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36796
36797 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36798
36799 return BB;
36800 }
36801 case X86::LCMPXCHG16B_NO_RBX: {
36802 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36803 Register BasePtr = TRI->getBaseRegister();
36804 if (TRI->hasBasePointer(*MF) &&
36805 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36806 if (!BB->isLiveIn(BasePtr))
36807 BB->addLiveIn(BasePtr);
36808 // Save RBX into a virtual register.
36809 Register SaveRBX =
36810 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36811 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36812 .addReg(X86::RBX);
36813 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36814      MachineInstrBuilder MIB =
36815          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36816 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36817 MIB.add(MI.getOperand(Idx));
36818 MIB.add(MI.getOperand(X86::AddrNumOperands));
36819 MIB.addReg(SaveRBX);
36820 } else {
36821 // Simple case, just copy the virtual register to RBX.
36822 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36823 .add(MI.getOperand(X86::AddrNumOperands));
36824      MachineInstrBuilder MIB =
36825          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36826 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36827 MIB.add(MI.getOperand(Idx));
36828 }
36829 MI.eraseFromParent();
36830 return BB;
36831 }
36832 case X86::MWAITX: {
36833 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36834 Register BasePtr = TRI->getBaseRegister();
36835 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36836    // If there is no need to save the base pointer, we generate MWAITXrrr;
36837    // otherwise we generate the pseudo MWAITX_SAVE_RBX.
36838 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36839 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36840 .addReg(MI.getOperand(0).getReg());
36841 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36842 .addReg(MI.getOperand(1).getReg());
36843 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36844 .addReg(MI.getOperand(2).getReg());
36845 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36846 MI.eraseFromParent();
36847 } else {
36848 if (!BB->isLiveIn(BasePtr)) {
36849 BB->addLiveIn(BasePtr);
36850 }
36851 // Parameters can be copied into ECX and EAX but not EBX yet.
36852 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36853 .addReg(MI.getOperand(0).getReg());
36854 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36855 .addReg(MI.getOperand(1).getReg());
36856 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36857 // Save RBX into a virtual register.
36858 Register SaveRBX =
36859 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36860 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36861 .addReg(X86::RBX);
36862 // Generate mwaitx pseudo.
36863 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36864 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36865 .addDef(Dst) // Destination tied in with SaveRBX.
36866 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36867 .addUse(SaveRBX); // Save of base pointer.
36868 MI.eraseFromParent();
36869 }
36870 return BB;
36871 }
36872 case TargetOpcode::PREALLOCATED_SETUP: {
36873 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36874 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36875 MFI->setHasPreallocatedCall(true);
36876 int64_t PreallocatedId = MI.getOperand(0).getImm();
36877 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36878 assert(StackAdjustment != 0 && "0 stack adjustment");
36879 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36880 << StackAdjustment << "\n");
36881 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36882 .addReg(X86::ESP)
36883 .addImm(StackAdjustment);
36884 MI.eraseFromParent();
36885 return BB;
36886 }
36887 case TargetOpcode::PREALLOCATED_ARG: {
36888 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36889 int64_t PreallocatedId = MI.getOperand(1).getImm();
36890 int64_t ArgIdx = MI.getOperand(2).getImm();
36891 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36892 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36893 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36894 << ", arg offset " << ArgOffset << "\n");
36895 // stack pointer + offset
36896 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36897 MI.getOperand(0).getReg()),
36898 X86::ESP, false, ArgOffset);
36899 MI.eraseFromParent();
36900 return BB;
36901 }
36902 case X86::PTDPBSSD:
36903 case X86::PTDPBSUD:
36904 case X86::PTDPBUSD:
36905 case X86::PTDPBUUD:
36906 case X86::PTDPBF16PS:
36907 case X86::PTDPFP16PS: {
36908 unsigned Opc;
36909 switch (MI.getOpcode()) {
36910 // clang-format off
36911 default: llvm_unreachable("illegal opcode!");
36912 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36913 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36914 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36915 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36916 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36917 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36918 // clang-format on
36919 }
36920
36921 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36922 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36923 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36924 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36925 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36926
36927 MI.eraseFromParent(); // The pseudo is gone now.
36928 return BB;
36929 }
36930 case X86::PTILEZERO: {
36931 unsigned Imm = MI.getOperand(0).getImm();
36932 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36933 MI.eraseFromParent(); // The pseudo is gone now.
36934 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36935    MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
36936    return BB;
36937 }
36938 case X86::PTILEZEROV: {
36939 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36940    MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
36941    return BB;
36942 }
36943 case X86::PTILELOADD:
36944 case X86::PTILELOADDT1:
36945 case X86::PTILESTORED: {
36946 unsigned Opc;
36947 switch (MI.getOpcode()) {
36948 default: llvm_unreachable("illegal opcode!");
36949#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36950 case X86::PTILELOADD:
36951 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36952 break;
36953 case X86::PTILELOADDT1:
36954 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36955 break;
36956 case X86::PTILESTORED:
36957 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36958 break;
36959#undef GET_EGPR_IF_ENABLED
36960 }
36961
36962 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36963 unsigned CurOp = 0;
36964 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36965 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36966                 RegState::Define);
36967
36968 MIB.add(MI.getOperand(CurOp++)); // base
36969 MIB.add(MI.getOperand(CurOp++)); // scale
36970 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36971 MIB.add(MI.getOperand(CurOp++)); // displacement
36972 MIB.add(MI.getOperand(CurOp++)); // segment
36973
36974 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36975 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36976                 RegState::Undef);
36977
36978 MI.eraseFromParent(); // The pseudo is gone now.
36979 return BB;
36980 }
36981 case X86::PTCMMIMFP16PS:
36982 case X86::PTCMMRLFP16PS: {
36983 const MIMetadata MIMD(MI);
36984 unsigned Opc;
36985 switch (MI.getOpcode()) {
36986 // clang-format off
36987 default: llvm_unreachable("Unexpected instruction!");
36988 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36989 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36990 // clang-format on
36991 }
36992 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36993 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36994 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36995 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36996 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36997 MI.eraseFromParent(); // The pseudo is gone now.
36998 return BB;
36999 }
37000 }
37001}
37002
37003//===----------------------------------------------------------------------===//
37004// X86 Optimization Hooks
37005//===----------------------------------------------------------------------===//
37006
37007bool
37008X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
37009                                                const APInt &DemandedBits,
37010 const APInt &DemandedElts,
37011 TargetLoweringOpt &TLO) const {
37012 EVT VT = Op.getValueType();
37013 unsigned Opcode = Op.getOpcode();
37014 unsigned EltSize = VT.getScalarSizeInBits();
37015
37016 if (VT.isVector()) {
37017 // If the constant is only all signbits in the active bits, then we should
37018    // extend it to the entire constant to allow it to act as a boolean constant
37019 // vector.
37020 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
37021 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
37022 return false;
37023 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
37024 if (!DemandedElts[i] || V.getOperand(i).isUndef())
37025 continue;
37026 const APInt &Val = V.getConstantOperandAPInt(i);
37027 if (Val.getBitWidth() > Val.getNumSignBits() &&
37028 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
37029 return true;
37030 }
37031 return false;
37032 };
37033 // For vectors - if we have a constant, then try to sign extend.
37034 // TODO: Handle AND cases.
37035 unsigned ActiveBits = DemandedBits.getActiveBits();
37036 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
37037 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
37038 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
37039 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
37040 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
37041                                   VT.getVectorNumElements());
37042      SDValue NewC =
37043          TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
37044                          Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
37045 SDValue NewOp =
37046 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
37047 return TLO.CombineTo(Op, NewOp);
37048 }
37049 return false;
37050 }
37051
37052 // Only optimize Ands to prevent shrinking a constant that could be
37053 // matched by movzx.
37054 if (Opcode != ISD::AND)
37055 return false;
37056
37057 // Make sure the RHS really is a constant.
37058 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
37059 if (!C)
37060 return false;
37061
37062 const APInt &Mask = C->getAPIntValue();
37063
37064 // Clear all non-demanded bits initially.
37065 APInt ShrunkMask = Mask & DemandedBits;
37066
37067 // Find the width of the shrunk mask.
37068 unsigned Width = ShrunkMask.getActiveBits();
37069
37070 // If the mask is all 0s there's nothing to do here.
37071 if (Width == 0)
37072 return false;
37073
37074 // Find the next power of 2 width, rounding up to a byte.
37075 Width = llvm::bit_ceil(std::max(Width, 8U));
37076 // Truncate the width to size to handle illegal types.
37077 Width = std::min(Width, EltSize);
37078
37079 // Calculate a possible zero extend mask for this constant.
37080 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
37081
37082 // If we aren't changing the mask, just return true to keep it and prevent
37083 // the caller from optimizing.
37084 if (ZeroExtendMask == Mask)
37085 return true;
37086
37087 // Make sure the new mask can be represented by a combination of mask bits
37088 // and non-demanded bits.
37089 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
37090 return false;
37091
37092 // Replace the constant with the zero extend mask.
37093 SDLoc DL(Op);
37094 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
37095 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
37096 return TLO.CombineTo(Op, NewOp);
37097}
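// Worked example for the AND path above (constants are illustrative): for
// "and x, 0x1FF" where only the low 8 bits are demanded, ShrunkMask is 0xFF,
// Width rounds up to 8, and ZeroExtendMask becomes 0xFF. Since 0xFF is a
// subset of Mask | ~DemandedBits, the constant is replaced with 0xFF, a mask
// the backend can then implement with a plain movzx.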
37098
37099static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
37100                                      KnownBits &Known,
37101 const APInt &DemandedElts,
37102 const SelectionDAG &DAG, unsigned Depth) {
37103 KnownBits Known2;
37104 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37105 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37106 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
37107 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
37108 Known = KnownBits::abdu(Known, Known2).zext(16);
37109 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
37110 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37111 Known, Known);
37112 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37113 Known, Known);
37114 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37115 Known, Known);
37116 Known = Known.zext(64);
37117}
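// Rationale for the bound above: PSADBW sums the absolute differences of
// eight unsigned byte pairs per 64-bit lane, so each result is at most
// 8 * 255 = 2040 and always fits in the low 16 bits. The three
// computeForAddSub steps model the three levels of the pairwise-add
// reduction of the eight differences before zero-extending to 64 bits.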
37118
37119static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
37120                                       KnownBits &Known,
37121 const APInt &DemandedElts,
37122 const SelectionDAG &DAG,
37123 unsigned Depth) {
37124 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37125
37126 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
37127 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37128 APInt DemandedLoElts =
37129 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37130 APInt DemandedHiElts =
37131 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37132 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37133 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37134 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37135 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37136 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
37137 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
37138 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
37139 /*NUW=*/false, Lo, Hi);
37140}
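// For reference, each i32 result lane of PMADDWD is
//   result[i] = LHS[2*i] * RHS[2*i] + LHS[2*i+1] * RHS[2*i+1]
// with signed 16 x 16 -> 32 multiplies, which is what the Lo/Hi demanded-elt
// split and the final computeForAddSub above model.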
37141
37142static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
37143                                         KnownBits &Known,
37144 const APInt &DemandedElts,
37145 const SelectionDAG &DAG,
37146 unsigned Depth) {
37147 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37148
37149 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
37150 // pairs.
37151 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37152 APInt DemandedLoElts =
37153 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37154 APInt DemandedHiElts =
37155 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37156 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37157 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37158 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37159 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37160 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
37161 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
37162 Known = KnownBits::sadd_sat(Lo, Hi);
37163}
37164
37165static KnownBits computeKnownBitsForHorizontalOperation(
37166    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
37167 const SelectionDAG &DAG,
37168 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
37169 KnownBitsFunc) {
37170 APInt DemandedEltsLHS, DemandedEltsRHS;
37171 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
37172 DemandedElts, DemandedEltsLHS,
37173 DemandedEltsRHS);
37174
37175 const auto ComputeForSingleOpFunc =
37176 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
37177 return KnownBitsFunc(
37178 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
37179 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
37180 };
37181
37182 if (DemandedEltsRHS.isZero())
37183 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
37184 if (DemandedEltsLHS.isZero())
37185 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
37186
37187 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
37188 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
37189}
37190
37191void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
37192                                                      KnownBits &Known,
37193 const APInt &DemandedElts,
37194 const SelectionDAG &DAG,
37195 unsigned Depth) const {
37196 unsigned BitWidth = Known.getBitWidth();
37197 unsigned NumElts = DemandedElts.getBitWidth();
37198 unsigned Opc = Op.getOpcode();
37199 EVT VT = Op.getValueType();
37200 assert((Opc >= ISD::BUILTIN_OP_END ||
37201 Opc == ISD::INTRINSIC_WO_CHAIN ||
37202 Opc == ISD::INTRINSIC_W_CHAIN ||
37203 Opc == ISD::INTRINSIC_VOID) &&
37204 "Should use MaskedValueIsZero if you don't know whether Op"
37205 " is a target node!");
37206
37207 Known.resetAll();
37208 switch (Opc) {
37209 default: break;
37210 case X86ISD::MUL_IMM: {
37211 KnownBits Known2;
37212 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37213 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37214 Known = KnownBits::mul(Known, Known2);
37215 break;
37216 }
37217 case X86ISD::SETCC:
37218 Known.Zero.setBitsFrom(1);
37219 break;
37220 case X86ISD::MOVMSK: {
37221 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
37222 Known.Zero.setBitsFrom(NumLoBits);
37223 break;
37224 }
37225 case X86ISD::PEXTRB:
37226 case X86ISD::PEXTRW: {
37227 SDValue Src = Op.getOperand(0);
37228 EVT SrcVT = Src.getValueType();
37229 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
37230 Op.getConstantOperandVal(1));
37231 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37232 Known = Known.anyextOrTrunc(BitWidth);
37233 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37234 break;
37235 }
37236 case X86ISD::VSRAI:
37237 case X86ISD::VSHLI:
37238 case X86ISD::VSRLI: {
37239 unsigned ShAmt = Op.getConstantOperandVal(1);
37240 if (ShAmt >= VT.getScalarSizeInBits()) {
37241 // Out of range logical bit shifts are guaranteed to be zero.
37242 // Out of range arithmetic bit shifts splat the sign bit.
37243 if (Opc != X86ISD::VSRAI) {
37244 Known.setAllZero();
37245 break;
37246 }
37247
37248 ShAmt = VT.getScalarSizeInBits() - 1;
37249 }
37250
37251 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37252 if (Opc == X86ISD::VSHLI) {
37253 Known.Zero <<= ShAmt;
37254 Known.One <<= ShAmt;
37255 // Low bits are known zero.
37256 Known.Zero.setLowBits(ShAmt);
37257 } else if (Opc == X86ISD::VSRLI) {
37258 Known.Zero.lshrInPlace(ShAmt);
37259 Known.One.lshrInPlace(ShAmt);
37260 // High bits are known zero.
37261 Known.Zero.setHighBits(ShAmt);
37262 } else {
37263 Known.Zero.ashrInPlace(ShAmt);
37264 Known.One.ashrInPlace(ShAmt);
37265 }
37266 break;
37267 }
37268 case X86ISD::PACKUS: {
37269 // PACKUS is just a truncation if the upper half is zero.
37270 APInt DemandedLHS, DemandedRHS;
37271 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37272
37273 Known.One = APInt::getAllOnes(BitWidth * 2);
37274 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37275
37276 KnownBits Known2;
37277 if (!!DemandedLHS) {
37278 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37279 Known = Known.intersectWith(Known2);
37280 }
37281 if (!!DemandedRHS) {
37282 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37283 Known = Known.intersectWith(Known2);
37284 }
37285
37286 if (Known.countMinLeadingZeros() < BitWidth)
37287 Known.resetAll();
37288 Known = Known.trunc(BitWidth);
37289 break;
37290 }
37291 case X86ISD::PSHUFB: {
37292 SDValue Src = Op.getOperand(0);
37293 SDValue Idx = Op.getOperand(1);
37294
37295 // If the index vector is never negative (MSB is zero), then all elements
37296 // come from the source vector. This is useful for cases where
37297 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
37298 // below will handle the more common constant shuffle mask case.
37299 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
37300 if (KnownIdx.isNonNegative())
37301 Known = DAG.computeKnownBits(Src, Depth + 1);
37302 break;
37303 }
37304 case X86ISD::VBROADCAST: {
37305 SDValue Src = Op.getOperand(0);
37306 if (!Src.getSimpleValueType().isVector()) {
37307 Known = DAG.computeKnownBits(Src, Depth + 1);
37308 return;
37309 }
37310 break;
37311 }
37312 case X86ISD::AND: {
37313 if (Op.getResNo() == 0) {
37314 KnownBits Known2;
37315 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37316 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37317 Known &= Known2;
37318 }
37319 break;
37320 }
37321 case X86ISD::ANDNP: {
37322 KnownBits Known2;
37323 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37324 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37325
37326 // ANDNP = (~X & Y);
37327 Known.One &= Known2.Zero;
37328 Known.Zero |= Known2.One;
37329 break;
37330 }
37331 case X86ISD::FOR: {
37332 KnownBits Known2;
37333 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37334 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37335
37336 Known |= Known2;
37337 break;
37338 }
37339 case X86ISD::PSADBW: {
37340 SDValue LHS = Op.getOperand(0);
37341 SDValue RHS = Op.getOperand(1);
37342 assert(VT.getScalarType() == MVT::i64 &&
37343 LHS.getValueType() == RHS.getValueType() &&
37344 LHS.getValueType().getScalarType() == MVT::i8 &&
37345 "Unexpected PSADBW types");
37346 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37347 break;
37348 }
37349 case X86ISD::PCMPGT:
37350 case X86ISD::PCMPEQ: {
37351 KnownBits KnownLhs =
37352 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37353 KnownBits KnownRhs =
37354 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37355 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37356 ? KnownBits::eq(KnownLhs, KnownRhs)
37357 : KnownBits::sgt(KnownLhs, KnownRhs);
37358 if (Res) {
37359 if (*Res)
37360 Known.setAllOnes();
37361 else
37362 Known.setAllZero();
37363 }
37364 break;
37365 }
37366 case X86ISD::VPMADDWD: {
37367 SDValue LHS = Op.getOperand(0);
37368 SDValue RHS = Op.getOperand(1);
37369 assert(VT.getVectorElementType() == MVT::i32 &&
37370 LHS.getValueType() == RHS.getValueType() &&
37371 LHS.getValueType().getVectorElementType() == MVT::i16 &&
37372 "Unexpected PMADDWD types");
37373 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37374 break;
37375 }
37376 case X86ISD::VPMADDUBSW: {
37377 SDValue LHS = Op.getOperand(0);
37378 SDValue RHS = Op.getOperand(1);
37379 assert(VT.getVectorElementType() == MVT::i16 &&
37380 LHS.getValueType() == RHS.getValueType() &&
37381 LHS.getValueType().getVectorElementType() == MVT::i8 &&
37382 "Unexpected PMADDUBSW types");
37383 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37384 break;
37385 }
37386 case X86ISD::PMULUDQ: {
37387 KnownBits Known2;
37388 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37389 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37390
37391 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37392 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37393 Known = KnownBits::mul(Known, Known2);
37394 break;
37395 }
37396 case X86ISD::CMOV: {
37397 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37398 // If we don't know any bits, early out.
37399 if (Known.isUnknown())
37400 break;
37401 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37402
37403 // Only known if known in both the LHS and RHS.
37404 Known = Known.intersectWith(Known2);
37405 break;
37406 }
37407 case X86ISD::BEXTR:
37408 case X86ISD::BEXTRI: {
37409 SDValue Op0 = Op.getOperand(0);
37410 SDValue Op1 = Op.getOperand(1);
37411
37412 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37413 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37414 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37415
37416 // If the length is 0, the result is 0.
37417 if (Length == 0) {
37418 Known.setAllZero();
37419 break;
37420 }
37421
37422 if ((Shift + Length) <= BitWidth) {
37423 Known = DAG.computeKnownBits(Op0, Depth + 1);
37424 Known = Known.extractBits(Length, Shift);
37425 Known = Known.zextOrTrunc(BitWidth);
37426 }
37427 }
37428 break;
37429 }
37430 case X86ISD::PDEP: {
37431 KnownBits Known2;
37432 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37433 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37434    // Zeros are retained from the mask operand, but ones are not.
37435 Known.One.clearAllBits();
37436 // The result will have at least as many trailing zeros as the non-mask
37437 // operand since bits can only map to the same or higher bit position.
37438 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
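    // Small example of the rule above: if the mask is known to be 0b0110,
    // every result bit outside positions 1 and 2 is known zero, and a source
    // with N trailing zeros yields a result with at least N trailing zeros.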
37439 break;
37440 }
37441 case X86ISD::PEXT: {
37442 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37443 // The result has as many leading zeros as the number of zeroes in the mask.
37444 unsigned Count = Known.Zero.popcount();
37445 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37446 Known.One.clearAllBits();
37447 break;
37448 }
37449 case X86ISD::VTRUNC:
37450 case X86ISD::VTRUNCS:
37451 case X86ISD::VTRUNCUS:
37452 case X86ISD::CVTSI2P:
37453 case X86ISD::CVTUI2P:
37454 case X86ISD::CVTP2SI:
37455 case X86ISD::CVTP2UI:
37456 case X86ISD::MCVTP2SI:
37457 case X86ISD::MCVTP2UI:
37458 case X86ISD::CVTTP2SI:
37459 case X86ISD::CVTTP2UI:
37460 case X86ISD::MCVTTP2SI:
37461 case X86ISD::MCVTTP2UI:
37462 case X86ISD::MCVTSI2P:
37463 case X86ISD::MCVTUI2P:
37464 case X86ISD::VFPROUND:
37465 case X86ISD::VMFPROUND:
37466 case X86ISD::CVTPS2PH:
37467 case X86ISD::MCVTPS2PH: {
37468 // Truncations/Conversions - upper elements are known zero.
37469 EVT SrcVT = Op.getOperand(0).getValueType();
37470 if (SrcVT.isVector()) {
37471 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37472 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37473 Known.setAllZero();
37474 }
37475 break;
37476 }
37477  case X86ISD::STRICT_CVTTP2SI:
37478  case X86ISD::STRICT_CVTTP2UI:
37479  case X86ISD::STRICT_CVTSI2P:
37480  case X86ISD::STRICT_CVTUI2P:
37481  case X86ISD::STRICT_VFPROUND:
37482  case X86ISD::STRICT_CVTPS2PH: {
37483    // Strict Conversions - upper elements are known zero.
37484 EVT SrcVT = Op.getOperand(1).getValueType();
37485 if (SrcVT.isVector()) {
37486 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37487 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37488 Known.setAllZero();
37489 }
37490 break;
37491 }
37492 case X86ISD::MOVQ2DQ: {
37493 // Move from MMX to XMM. Upper half of XMM should be 0.
37494 if (DemandedElts.countr_zero() >= (NumElts / 2))
37495 Known.setAllZero();
37496 break;
37497 }
37498  case X86ISD::VBROADCAST_LOAD: {
37499    APInt UndefElts;
37500 SmallVector<APInt, 16> EltBits;
37501 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37502 /*AllowWholeUndefs*/ false,
37503 /*AllowPartialUndefs*/ false)) {
37504 Known.Zero.setAllBits();
37505 Known.One.setAllBits();
37506 for (unsigned I = 0; I != NumElts; ++I) {
37507 if (!DemandedElts[I])
37508 continue;
37509 if (UndefElts[I]) {
37510 Known.resetAll();
37511 break;
37512 }
37513 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37514 Known = Known.intersectWith(Known2);
37515 }
37516 return;
37517 }
37518 break;
37519 }
37520 case X86ISD::HADD:
37521 case X86ISD::HSUB: {
37522    Known = computeKnownBitsForHorizontalOperation(
37523        Op, DemandedElts, Depth, DAG,
37524 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
37525          return KnownBits::computeForAddSub(
37526              /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
37527 KnownLHS, KnownRHS);
37528 });
37529 break;
37530 }
37531  case ISD::INTRINSIC_WO_CHAIN: {
37532    switch (Op->getConstantOperandVal(0)) {
37533 case Intrinsic::x86_sse2_pmadd_wd:
37534 case Intrinsic::x86_avx2_pmadd_wd:
37535 case Intrinsic::x86_avx512_pmaddw_d_512: {
37536 SDValue LHS = Op.getOperand(1);
37537 SDValue RHS = Op.getOperand(2);
37538 assert(VT.getScalarType() == MVT::i32 &&
37539 LHS.getValueType() == RHS.getValueType() &&
37540 LHS.getValueType().getScalarType() == MVT::i16 &&
37541 "Unexpected PMADDWD types");
37542 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37543 break;
37544 }
37545 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
37546 case Intrinsic::x86_avx2_pmadd_ub_sw:
37547 case Intrinsic::x86_avx512_pmaddubs_w_512: {
37548 SDValue LHS = Op.getOperand(1);
37549 SDValue RHS = Op.getOperand(2);
37550 assert(VT.getScalarType() == MVT::i16 &&
37551 LHS.getValueType() == RHS.getValueType() &&
37552 LHS.getValueType().getScalarType() == MVT::i8 &&
37553 "Unexpected PMADDUBSW types");
37554 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37555 break;
37556 }
37557 case Intrinsic::x86_sse2_psad_bw:
37558 case Intrinsic::x86_avx2_psad_bw:
37559 case Intrinsic::x86_avx512_psad_bw_512: {
37560 SDValue LHS = Op.getOperand(1);
37561 SDValue RHS = Op.getOperand(2);
37562 assert(VT.getScalarType() == MVT::i64 &&
37563 LHS.getValueType() == RHS.getValueType() &&
37564 LHS.getValueType().getScalarType() == MVT::i8 &&
37565 "Unexpected PSADBW types");
37566 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37567 break;
37568 }
37569 }
37570 break;
37571 }
37572 }
37573
37574 // Handle target shuffles.
37575 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37576 if (isTargetShuffle(Opc)) {
37577    SmallVector<int, 64> Mask;
37578    SmallVector<SDValue, 2> Ops;
37579    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37580 unsigned NumOps = Ops.size();
37581 unsigned NumElts = VT.getVectorNumElements();
37582 if (Mask.size() == NumElts) {
37583 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37584 Known.Zero.setAllBits(); Known.One.setAllBits();
37585 for (unsigned i = 0; i != NumElts; ++i) {
37586 if (!DemandedElts[i])
37587 continue;
37588 int M = Mask[i];
37589 if (M == SM_SentinelUndef) {
37590 // For UNDEF elements, we don't know anything about the common state
37591 // of the shuffle result.
37592 Known.resetAll();
37593 break;
37594 }
37595 if (M == SM_SentinelZero) {
37596 Known.One.clearAllBits();
37597 continue;
37598 }
37599 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37600 "Shuffle index out of range");
37601
37602 unsigned OpIdx = (unsigned)M / NumElts;
37603 unsigned EltIdx = (unsigned)M % NumElts;
37604 if (Ops[OpIdx].getValueType() != VT) {
37605 // TODO - handle target shuffle ops with different value types.
37606 Known.resetAll();
37607 break;
37608 }
37609 DemandedOps[OpIdx].setBit(EltIdx);
37610 }
37611 // Known bits are the values that are shared by every demanded element.
37612 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37613 if (!DemandedOps[i])
37614 continue;
37615 KnownBits Known2 =
37616 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37617 Known = Known.intersectWith(Known2);
37618 }
37619 }
37620 }
37621 }
37622}
37623
37624unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37625    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37626 unsigned Depth) const {
37627 EVT VT = Op.getValueType();
37628 unsigned VTBits = VT.getScalarSizeInBits();
37629 unsigned Opcode = Op.getOpcode();
37630 switch (Opcode) {
37631  case X86ISD::SETCC_CARRY:
37632    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37633 return VTBits;
37634
37635 case X86ISD::VTRUNC: {
37636 SDValue Src = Op.getOperand(0);
37637 MVT SrcVT = Src.getSimpleValueType();
37638 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37639 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37640 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37641 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37642 if (Tmp > (NumSrcBits - VTBits))
37643 return Tmp - (NumSrcBits - VTBits);
37644 return 1;
37645 }
37646
37647 case X86ISD::PACKSS: {
37648 // PACKSS is just a truncation if the sign bits extend to the packed size.
37649 APInt DemandedLHS, DemandedRHS;
37650 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37651 DemandedRHS);
37652
37653 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37654 // patterns often used to compact vXi64 allsignbit patterns.
37655 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37656      SDValue BC = peekThroughBitcasts(V);
37657      if (BC.getOpcode() == X86ISD::PACKSS &&
37658 BC.getScalarValueSizeInBits() == 16 &&
37659 V.getScalarValueSizeInBits() == 32) {
37660        SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37661        SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37662        if (BC0.getScalarValueSizeInBits() == 64 &&
37663 BC1.getScalarValueSizeInBits() == 64 &&
37664 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37665 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37666 return 32;
37667 }
37668 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37669 };
37670
37671 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37672 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37673 if (!!DemandedLHS)
37674 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37675 if (!!DemandedRHS)
37676 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37677 unsigned Tmp = std::min(Tmp0, Tmp1);
37678 if (Tmp > (SrcBits - VTBits))
37679 return Tmp - (SrcBits - VTBits);
37680 return 1;
37681 }
37682
37683 case X86ISD::VBROADCAST: {
37684 SDValue Src = Op.getOperand(0);
37685 if (!Src.getSimpleValueType().isVector())
37686 return DAG.ComputeNumSignBits(Src, Depth + 1);
37687 break;
37688 }
37689
37690 case X86ISD::VSHLI: {
37691 SDValue Src = Op.getOperand(0);
37692 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37693 if (ShiftVal.uge(VTBits))
37694 return VTBits; // Shifted all bits out --> zero.
37695 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37696 if (ShiftVal.uge(Tmp))
37697 return 1; // Shifted all sign bits out --> unknown.
37698 return Tmp - ShiftVal.getZExtValue();
37699 }
37700
37701 case X86ISD::VSRAI: {
37702 SDValue Src = Op.getOperand(0);
37703 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37704 if (ShiftVal.uge(VTBits - 1))
37705 return VTBits; // Sign splat.
37706 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37707 ShiftVal += Tmp;
37708 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37709 }
37710
37711 case X86ISD::FSETCC:
37712 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37713 if (VT == MVT::f32 || VT == MVT::f64 ||
37714 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37715 return VTBits;
37716 break;
37717
37718 case X86ISD::PCMPGT:
37719 case X86ISD::PCMPEQ:
37720 case X86ISD::CMPP:
37721 case X86ISD::VPCOM:
37722 case X86ISD::VPCOMU:
37723 // Vector compares return zero/all-bits result values.
37724 return VTBits;
37725
37726 case X86ISD::ANDNP: {
37727 unsigned Tmp0 =
37728 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37729 if (Tmp0 == 1) return 1; // Early out.
37730 unsigned Tmp1 =
37731 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37732 return std::min(Tmp0, Tmp1);
37733 }
37734
37735 case X86ISD::CMOV: {
37736 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37737 if (Tmp0 == 1) return 1; // Early out.
37738 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37739 return std::min(Tmp0, Tmp1);
37740 }
37741 }
37742
37743 // Handle target shuffles.
37744 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37745 if (isTargetShuffle(Opcode)) {
37746    SmallVector<int, 64> Mask;
37747    SmallVector<SDValue, 2> Ops;
37748    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37749 unsigned NumOps = Ops.size();
37750 unsigned NumElts = VT.getVectorNumElements();
37751 if (Mask.size() == NumElts) {
37752 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37753 for (unsigned i = 0; i != NumElts; ++i) {
37754 if (!DemandedElts[i])
37755 continue;
37756 int M = Mask[i];
37757 if (M == SM_SentinelUndef) {
37758 // For UNDEF elements, we don't know anything about the common state
37759 // of the shuffle result.
37760 return 1;
37761 } else if (M == SM_SentinelZero) {
37762 // Zero = all sign bits.
37763 continue;
37764 }
37765 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37766 "Shuffle index out of range");
37767
37768 unsigned OpIdx = (unsigned)M / NumElts;
37769 unsigned EltIdx = (unsigned)M % NumElts;
37770 if (Ops[OpIdx].getValueType() != VT) {
37771 // TODO - handle target shuffle ops with different value types.
37772 return 1;
37773 }
37774 DemandedOps[OpIdx].setBit(EltIdx);
37775 }
37776 unsigned Tmp0 = VTBits;
37777 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37778 if (!DemandedOps[i])
37779 continue;
37780 unsigned Tmp1 =
37781 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37782 Tmp0 = std::min(Tmp0, Tmp1);
37783 }
37784 return Tmp0;
37785 }
37786 }
37787 }
37788
37789 // Fallback case.
37790 return 1;
37791}
37792
37793SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
37794  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37795 return N->getOperand(0);
37796 return N;
37797}
37798
37799// Helper to look for a normal load that can be narrowed into a vzload with the
37800// specified VT and memory VT. Returns SDValue() on failure.
37801static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37802                                  SelectionDAG &DAG) {
37803 // Can't if the load is volatile or atomic.
37804 if (!LN->isSimple())
37805 return SDValue();
37806
37807 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37808 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37809 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37810 LN->getPointerInfo(), LN->getOriginalAlign(),
37811 LN->getMemOperand()->getFlags());
37812}
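// Usage note: the VZEXT_LOAD produced here loads only MemVT bytes and zeros
// the remaining upper elements of VT, so callers use it to replace a wider
// vector load whose upper elements are known to be unused; the original load
// must be simple (non-volatile, non-atomic), as checked above.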
37813
37814// Attempt to match a combined shuffle mask against supported unary shuffle
37815// instructions.
37816// TODO: Investigate sharing more of this with shuffle lowering.
37817static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37818 bool AllowFloatDomain, bool AllowIntDomain,
37819 SDValue V1, const SelectionDAG &DAG,
37820 const X86Subtarget &Subtarget, unsigned &Shuffle,
37821 MVT &SrcVT, MVT &DstVT) {
37822 unsigned NumMaskElts = Mask.size();
37823 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37824
37825 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37826 if (Mask[0] == 0 &&
37827 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37828 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37829        (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37830         isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37831 Shuffle = X86ISD::VZEXT_MOVL;
37832 if (MaskEltSize == 16)
37833 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37834 else
37835 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37836 return true;
37837 }
37838 }
37839
37840 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37841 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37842 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37843 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37844 unsigned MaxScale = 64 / MaskEltSize;
37845 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37846 DAG.ComputeNumSignBits(V1) == MaskEltSize;
37847 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37848 bool MatchAny = true;
37849 bool MatchZero = true;
37850 bool MatchSign = UseSign;
37851 unsigned NumDstElts = NumMaskElts / Scale;
37852 for (unsigned i = 0;
37853 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37854 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37855 MatchAny = MatchSign = MatchZero = false;
37856 break;
37857 }
37858 unsigned Pos = (i * Scale) + 1;
37859 unsigned Len = Scale - 1;
37860 MatchAny &= isUndefInRange(Mask, Pos, Len);
37861 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37862 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37863 }
37864 if (MatchAny || MatchSign || MatchZero) {
37865 assert((MatchSign || MatchZero) &&
37866 "Failed to match sext/zext but matched aext?");
37867 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37868 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37869 : MVT::getIntegerVT(MaskEltSize);
37870 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37871
37872 Shuffle = unsigned(
37873 MatchAny ? ISD::ANY_EXTEND
37874 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37875 if (SrcVT.getVectorNumElements() != NumDstElts)
37876 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37877
37878 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37879 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37880 return true;
37881 }
37882 }
37883 }
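// Worked example: a v8i16 mask {0, Z, 1, Z, 2, Z, 3, Z} (Z = SM_SentinelZero)
// matches at Scale = 2 with only MatchZero set, so this returns
// Shuffle = ISD::ZERO_EXTEND_VECTOR_INREG, SrcVT = v8i16 and DstVT = v4i32,
// i.e. a PMOVZXWD of the low four words.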
37884
37885 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
37886 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37887 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37888 isUndefOrEqual(Mask[0], 0) &&
37889 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37890 Shuffle = X86ISD::VZEXT_MOVL;
37891 if (MaskEltSize == 16)
37892 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37893 else
37894 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37895 return true;
37896 }
37897
37898 // Check if we have SSE3, which will let us use MOVDDUP etc. These
37899 // instructions are no slower than UNPCKLPD but have the option to
37900 // fold the input operand, even from an unaligned memory load.
37901 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37902 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37903 Shuffle = X86ISD::MOVDDUP;
37904 SrcVT = DstVT = MVT::v2f64;
37905 return true;
37906 }
37907 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37908 Shuffle = X86ISD::MOVSLDUP;
37909 SrcVT = DstVT = MVT::v4f32;
37910 return true;
37911 }
37912 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37913 Shuffle = X86ISD::MOVSHDUP;
37914 SrcVT = DstVT = MVT::v4f32;
37915 return true;
37916 }
37917 }
37918
37919 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37920 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37921 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37922 Shuffle = X86ISD::MOVDDUP;
37923 SrcVT = DstVT = MVT::v4f64;
37924 return true;
37925 }
37926 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37927 V1)) {
37928 Shuffle = X86ISD::MOVSLDUP;
37929 SrcVT = DstVT = MVT::v8f32;
37930 return true;
37931 }
37932 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37933 V1)) {
37934 Shuffle = X86ISD::MOVSHDUP;
37935 SrcVT = DstVT = MVT::v8f32;
37936 return true;
37937 }
37938 }
37939
37940 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37941 assert(Subtarget.hasAVX512() &&
37942 "AVX512 required for 512-bit vector shuffles");
37943 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37944 V1)) {
37945 Shuffle = X86ISD::MOVDDUP;
37946 SrcVT = DstVT = MVT::v8f64;
37947 return true;
37948 }
37949 if (isTargetShuffleEquivalent(
37950 MaskVT, Mask,
37951 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37952 Shuffle = X86ISD::MOVSLDUP;
37953 SrcVT = DstVT = MVT::v16f32;
37954 return true;
37955 }
37956 if (isTargetShuffleEquivalent(
37957 MaskVT, Mask,
37958 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37959 Shuffle = X86ISD::MOVSHDUP;
37960 SrcVT = DstVT = MVT::v16f32;
37961 return true;
37962 }
37963 }
37964
37965 return false;
37966}
37967
37968// Attempt to match a combined shuffle mask against supported unary immediate
37969// permute instructions.
37970// TODO: Investigate sharing more of this with shuffle lowering.
37971static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37972 const APInt &Zeroable,
37973 bool AllowFloatDomain, bool AllowIntDomain,
37974 const SelectionDAG &DAG,
37975 const X86Subtarget &Subtarget,
37976 unsigned &Shuffle, MVT &ShuffleVT,
37977 unsigned &PermuteImm) {
37978 unsigned NumMaskElts = Mask.size();
37979 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37980 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37981 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37982 bool ContainsZeros = isAnyZero(Mask);
37983
37984 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37985 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37986 // Check for lane crossing permutes.
37987 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37988 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37989 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37990 Shuffle = X86ISD::VPERMI;
37991 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37992 PermuteImm = getV4X86ShuffleImm(Mask);
37993 return true;
37994 }
37995 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37996 SmallVector<int, 4> RepeatedMask;
37997 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37998 Shuffle = X86ISD::VPERMI;
37999 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
38000 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
38001 return true;
38002 }
38003 }
38004 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
38005 // VPERMILPD can permute with a non-repeating shuffle.
38006 Shuffle = X86ISD::VPERMILPI;
38007 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
38008 PermuteImm = 0;
38009 for (int i = 0, e = Mask.size(); i != e; ++i) {
38010 int M = Mask[i];
38011 if (M == SM_SentinelUndef)
38012 continue;
38013 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
38014 PermuteImm |= (M & 1) << i;
38015 }
38016 return true;
38017 }
38018 }
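// Worked example for the VPERMILPD immediate loop above: a v4f64 mask
// {1, 0, 3, 2} sets bit i whenever element i selects the odd element of its
// 128-bit pair, giving PermuteImm = 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0x5.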
38019
38020 // We are checking for shuffle match or shift match. Loop twice so we can
38021 // choose which one we try to match first, depending on target preference.
38022 for (unsigned Order = 0; Order < 2; ++Order) {
38023 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
38024 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
38025 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
38026 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
38027 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
38028 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
38029 SmallVector<int, 4> RepeatedMask;
38030 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38031 // Narrow the repeated mask to create 32-bit element permutes.
38032 SmallVector<int, 4> WordMask = RepeatedMask;
38033 if (MaskScalarSizeInBits == 64)
38034 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
38035
38036 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
38037 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
38038 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
38039 PermuteImm = getV4X86ShuffleImm(WordMask);
38040 return true;
38041 }
38042 }
38043
38044 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
38045 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
38046 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38047 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38048 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38049 SmallVector<int, 4> RepeatedMask;
38050 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38051 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
38052 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
38053
38054 // PSHUFLW: permute lower 4 elements only.
38055 if (isUndefOrInRange(LoMask, 0, 4) &&
38056 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
38057 Shuffle = X86ISD::PSHUFLW;
38058 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38059 PermuteImm = getV4X86ShuffleImm(LoMask);
38060 return true;
38061 }
38062
38063 // PSHUFHW: permute upper 4 elements only.
38064 if (isUndefOrInRange(HiMask, 4, 8) &&
38065 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
38066 // Offset the HiMask so that we can create the shuffle immediate.
38067 int OffsetHiMask[4];
38068 for (int i = 0; i != 4; ++i)
38069 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38070
38071 Shuffle = X86ISD::PSHUFHW;
38072 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38073 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
38074 return true;
38075 }
38076 }
38077 }
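// Worked example for the PSHUFLW/PSHUFHW matching above: a repeated v8i16
// mask {0, 1, 2, 3, 7, 6, 5, 4} keeps the low half in order, so it matches
// PSHUFHW with OffsetHiMask = {3, 2, 1, 0} and, using the 2-bits-per-element
// encoding of getV4X86ShuffleImm,
// PermuteImm = 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.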
38078 } else {
38079 // Attempt to match against bit rotates.
38080 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
38081 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
38082 Subtarget.hasAVX512())) {
38083 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
38084 Subtarget, Mask);
38085 if (0 < RotateAmt) {
38086 Shuffle = X86ISD::VROTLI;
38087 PermuteImm = (unsigned)RotateAmt;
38088 return true;
38089 }
38090 }
38091 }
38092 // Attempt to match against byte/bit shifts.
38093 if (AllowIntDomain &&
38094 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38095 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38096 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38097 int ShiftAmt =
38098 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
38099 Zeroable, Subtarget);
38100 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
38101 32 <= ShuffleVT.getScalarSizeInBits())) {
38102 // Byte shifts can be slower so only match them on second attempt.
38103 if (Order == 0 &&
38104 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
38105 continue;
38106
38107 PermuteImm = (unsigned)ShiftAmt;
38108 return true;
38109 }
38110
38111 }
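// For example, a v8i16 mask {1, Z, 3, Z, 5, Z, 7, Z} (Z = SM_SentinelZero)
// keeps the high word of each 32-bit chunk and zeroes the rest, so
// matchShuffleAsShift would typically report it as a logical right shift of
// v4i32 elements by 16 bits (Shuffle = X86ISD::VSRLI, PermuteImm = 16).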
38112 }
38113
38114 return false;
38115}
38116
38117// Attempt to match a combined unary shuffle mask against supported binary
38118// shuffle instructions.
38119// TODO: Investigate sharing more of this with shuffle lowering.
38120static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38121 bool AllowFloatDomain, bool AllowIntDomain,
38122 SDValue &V1, SDValue &V2, const SDLoc &DL,
38123 SelectionDAG &DAG, const X86Subtarget &Subtarget,
38124 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
38125 bool IsUnary) {
38126 unsigned NumMaskElts = Mask.size();
38127 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38128 unsigned SizeInBits = MaskVT.getSizeInBits();
38129
38130 if (MaskVT.is128BitVector()) {
38131 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
38132 AllowFloatDomain) {
38133 V2 = V1;
38134 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
38135 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
38136 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38137 return true;
38138 }
38139 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
38140 AllowFloatDomain) {
38141 V2 = V1;
38142 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
38143 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38144 return true;
38145 }
38146 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
38147 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
38148 std::swap(V1, V2);
38149 Shuffle = X86ISD::MOVSD;
38150 SrcVT = DstVT = MVT::v2f64;
38151 return true;
38152 }
38153 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
38154 (AllowFloatDomain || !Subtarget.hasSSE41())) {
38155 Shuffle = X86ISD::MOVSS;
38156 SrcVT = DstVT = MVT::v4f32;
38157 return true;
38158 }
38159 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
38160 DAG) &&
38161 Subtarget.hasFP16()) {
38162 Shuffle = X86ISD::MOVSH;
38163 SrcVT = DstVT = MVT::v8f16;
38164 return true;
38165 }
38166 }
38167
38168 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
38169 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
38170 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
38171 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
38172 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
38173 Subtarget)) {
38174 DstVT = MaskVT;
38175 return true;
38176 }
38177 }
38178 // TODO: Can we handle this inside matchShuffleWithPACK?
38179 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
38180 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
38181 V1.getScalarValueSizeInBits() == 64 &&
38182 V2.getScalarValueSizeInBits() == 64) {
38183 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
38184 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
38185 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
38186 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
38187 SrcVT = MVT::v4i32;
38188 DstVT = MVT::v8i16;
38189 Shuffle = X86ISD::PACKUS;
38190 return true;
38191 }
38192 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
38193 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
38194 SrcVT = MVT::v8i16;
38195 DstVT = MVT::v16i8;
38196 Shuffle = X86ISD::PACKUS;
38197 return true;
38198 }
38199 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
38200 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
38201 SrcVT = MVT::v4i32;
38202 DstVT = MVT::v8i16;
38203 Shuffle = X86ISD::PACKSS;
38204 return true;
38205 }
38206 }
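// For example, with V1 and V2 being v2i64 values whose elements are known to
// fit in 16 bits (>= 48 leading zero bits), the v4i32 mask {0, 2, 4, 6}
// requests the low dword of every 64-bit element; PACKUSDW on the v4i32
// views never saturates here, so the packed v8i16 result (whose odd words
// are zero) is exactly the requested shuffle when viewed as v4i32.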
38207
38208 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
38209 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
38210 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38211 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
38212 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38213 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
38214 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
38215 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
38216 Subtarget)) {
38217 SrcVT = DstVT = MaskVT;
38218 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
38219 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
38220 return true;
38221 }
38222 }
38223
38224 // Attempt to match against an OR if we're performing a blend shuffle and the
38225 // non-blended source element is zero in each case.
38226 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
38227 if (SizeInBits == V1.getValueSizeInBits() &&
38228 SizeInBits == V2.getValueSizeInBits() &&
38229 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38230 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
38231 bool IsBlend = true;
38232 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
38233 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
38234 unsigned Scale1 = NumV1Elts / NumMaskElts;
38235 unsigned Scale2 = NumV2Elts / NumMaskElts;
38236 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
38237 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
38238 for (unsigned i = 0; i != NumMaskElts; ++i) {
38239 int M = Mask[i];
38240 if (M == SM_SentinelUndef)
38241 continue;
38242 if (M == SM_SentinelZero) {
38243 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38244 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38245 continue;
38246 }
38247 if (M == (int)i) {
38248 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38249 continue;
38250 }
38251 if (M == (int)(i + NumMaskElts)) {
38252 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38253 continue;
38254 }
38255 IsBlend = false;
38256 break;
38257 }
38258 if (IsBlend) {
38259 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
38260 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
38261 Shuffle = ISD::OR;
38262 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38263 return true;
38264 }
38265 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
38266 // FIXME: handle mismatched sizes?
38267 // TODO: investigate if `ISD::OR` handling in
38268 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
38269 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
38270 unsigned NumElts = V.getValueType().getVectorNumElements();
38271 KnownBits Known(NumElts);
38272 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
38273 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
38274 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
38275 if (PeepholeKnown.isZero())
38276 Known.Zero.setBit(EltIdx);
38277 if (PeepholeKnown.isAllOnes())
38278 Known.One.setBit(EltIdx);
38279 }
38280 return Known;
38281 };
38282
38283 KnownBits V1Known = computeKnownBitsElementWise(V1);
38284 KnownBits V2Known = computeKnownBitsElementWise(V2);
38285
38286 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38287 int M = Mask[i];
38288 if (M == SM_SentinelUndef)
38289 continue;
38290 if (M == SM_SentinelZero) {
38291 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38292 continue;
38293 }
38294 if (M == (int)i) {
38295 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38296 continue;
38297 }
38298 if (M == (int)(i + NumMaskElts)) {
38299 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38300 continue;
38301 }
38302 llvm_unreachable("will not get here.");
38303 }
38304 if (IsBlend) {
38305 Shuffle = ISD::OR;
38306 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38307 return true;
38308 }
38309 }
38310 }
38311 }
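// Worked example for the blend-to-OR case above: a v4i32 mask {0, 5, 2, 7}
// takes elements 0 and 2 from V1 and elements 1 and 3 from V2. If V2's
// elements 0 and 2 and V1's elements 1 and 3 are all known zero, then
// OR(V1, V2) computes the same blend without any shuffle.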
38312
38313 return false;
38314}
38315
38316static bool matchBinaryPermuteShuffle(
38317 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38318 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38319 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38320 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38321 unsigned NumMaskElts = Mask.size();
38322 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38323
38324 // Attempt to match against VALIGND/VALIGNQ rotate.
38325 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38326 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38327 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38328 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38329 if (!isAnyZero(Mask)) {
38330 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38331 if (0 < Rotation) {
38332 Shuffle = X86ISD::VALIGN;
38333 if (EltSizeInBits == 64)
38334 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38335 else
38336 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38337 PermuteImm = Rotation;
38338 return true;
38339 }
38340 }
38341 }
38342
38343 // Attempt to match against PALIGNR byte rotate.
38344 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38345 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38346 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38347 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38348 if (0 < ByteRotation) {
38349 Shuffle = X86ISD::PALIGNR;
38350 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38351 PermuteImm = ByteRotation;
38352 return true;
38353 }
38354 }
38355
38356 // Attempt to combine to X86ISD::BLENDI.
38357 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38358 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38359 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38360 uint64_t BlendMask = 0;
38361 bool ForceV1Zero = false, ForceV2Zero = false;
38362 SmallVector<int, 8> TargetMask(Mask);
38363 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38364 ForceV2Zero, BlendMask)) {
38365 if (MaskVT == MVT::v16i16) {
38366 // We can only use v16i16 PBLENDW if the lanes are repeated.
38367 SmallVector<int, 8> RepeatedMask;
38368 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38369 RepeatedMask)) {
38370 assert(RepeatedMask.size() == 8 &&
38371 "Repeated mask size doesn't match!");
38372 PermuteImm = 0;
38373 for (int i = 0; i < 8; ++i)
38374 if (RepeatedMask[i] >= 8)
38375 PermuteImm |= 1 << i;
38376 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38377 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38378 Shuffle = X86ISD::BLENDI;
38379 ShuffleVT = MaskVT;
38380 return true;
38381 }
38382 } else {
38383 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38384 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38385 PermuteImm = (unsigned)BlendMask;
38386 Shuffle = X86ISD::BLENDI;
38387 ShuffleVT = MaskVT;
38388 return true;
38389 }
38390 }
38391 }
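// Worked example: a v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15} is a blend that
// takes the odd elements from V2, so matchShuffleAsBlend produces
// BlendMask = 0b10101010 and this becomes PBLENDW with PermuteImm = 0xAA.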
38392
38393 // Attempt to combine to INSERTPS, but only if it has elements that need to
38394 // be set to zero.
38395 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38396 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38397 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38398 Shuffle = X86ISD::INSERTPS;
38399 ShuffleVT = MVT::v4f32;
38400 return true;
38401 }
38402
38403 // Attempt to combine to SHUFPD.
38404 if (AllowFloatDomain && EltSizeInBits == 64 &&
38405 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38406 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38407 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38408 bool ForceV1Zero = false, ForceV2Zero = false;
38409 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38410 PermuteImm, Mask, Zeroable)) {
38411 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38412 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38413 Shuffle = X86ISD::SHUFP;
38414 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38415 return true;
38416 }
38417 }
38418
38419 // Attempt to combine to SHUFPS.
38420 if (AllowFloatDomain && EltSizeInBits == 32 &&
38421 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38422 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38423 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38424 SmallVector<int, 4> RepeatedMask;
38425 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38426 // Match each half of the repeated mask to determine whether it is just
38427 // referencing one of the vectors, is zeroable, or is entirely undef.
38428 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38429 int M0 = RepeatedMask[Offset];
38430 int M1 = RepeatedMask[Offset + 1];
38431
38432 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38433 return DAG.getUNDEF(MaskVT);
38434 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38435 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38436 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38437 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38438 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38439 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38440 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38441 return V1;
38442 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38443 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38444 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38445 return V2;
38446 }
38447
38448 return SDValue();
38449 };
38450
38451 int ShufMask[4] = {-1, -1, -1, -1};
38452 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38453 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38454
38455 if (Lo && Hi) {
38456 V1 = Lo;
38457 V2 = Hi;
38458 Shuffle = X86ISD::SHUFP;
38459 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38460 PermuteImm = getV4X86ShuffleImm(ShufMask);
38461 return true;
38462 }
38463 }
38464 }
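// Worked example: a repeated v4f32 mask {0, 1, 6, 7} resolves Lo = V1 with
// indices {0, 1} and Hi = V2 with indices {6 & 3, 7 & 3} = {2, 3}, so
// ShufMask = {0, 1, 2, 3} and PermuteImm = 0xE4, i.e. a SHUFPS that keeps
// V1's low half and V2's high half.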
38465
38466 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38467 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38468 MaskVT.is128BitVector() &&
38469 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38470 Shuffle = X86ISD::INSERTPS;
38471 ShuffleVT = MVT::v4f32;
38472 return true;
38473 }
38474
38475 return false;
38476}
38477
38478static SDValue combineX86ShuffleChainWithExtract(
38479 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38480 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38481 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38482 const X86Subtarget &Subtarget);
38483
38484/// Combine an arbitrary chain of shuffles into a single instruction if
38485/// possible.
38486///
38487/// This is the leaf of the recursive combine below. When we have found some
38488/// chain of single-use x86 shuffle instructions and accumulated the combined
38489/// shuffle mask represented by them, this will try to pattern match that mask
38490/// into either a single instruction if there is a special purpose instruction
38491/// for this operation, or into a PSHUFB instruction which is a fully general
38492/// instruction but should only be used to replace chains over a certain depth.
38493static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38494 ArrayRef<int> BaseMask, int Depth,
38495 bool HasVariableMask,
38496 bool AllowVariableCrossLaneMask,
38497 bool AllowVariablePerLaneMask,
38498 SelectionDAG &DAG,
38499 const X86Subtarget &Subtarget) {
38500 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38501 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38502 "Unexpected number of shuffle inputs!");
38503
38504 SDLoc DL(Root);
38505 MVT RootVT = Root.getSimpleValueType();
38506 unsigned RootSizeInBits = RootVT.getSizeInBits();
38507 unsigned NumRootElts = RootVT.getVectorNumElements();
38508
38509 // Canonicalize shuffle input op to the requested type.
38510 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38511 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38512 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38513 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38514 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38515 return DAG.getBitcast(VT, Op);
38516 };
38517
38518 // Find the inputs that enter the chain. Note that multiple uses are OK
38519 // here, we're not going to remove the operands we find.
38520 bool UnaryShuffle = (Inputs.size() == 1);
38521 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38522 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38523 : peekThroughBitcasts(Inputs[1]));
38524
38525 MVT VT1 = V1.getSimpleValueType();
38526 MVT VT2 = V2.getSimpleValueType();
38527 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38528 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38529
38530 SDValue Res;
38531
38532 unsigned NumBaseMaskElts = BaseMask.size();
38533 if (NumBaseMaskElts == 1) {
38534 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38535 return CanonicalizeShuffleInput(RootVT, V1);
38536 }
38537
38538 bool OptForSize = DAG.shouldOptForSize();
38539 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38540 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38541 (RootVT.isFloatingPoint() && Depth >= 1) ||
38542 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38543
38544 // Don't combine if we are an AVX512/EVEX target and the mask element size
38545 // is different from the root element size - this would prevent writemasks
38546 // from being reused.
38547 bool IsMaskedShuffle = false;
38548 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38549 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38550 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38551 IsMaskedShuffle = true;
38552 }
38553 }
38554
38555 // If we are shuffling a splat (and not introducing zeros) then we can just
38556 // use it directly. This works for smaller elements as well as they already
38557 // repeat across each mask element.
38558 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38559 V1.getValueSizeInBits() >= RootSizeInBits &&
38560 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38561 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38562 return CanonicalizeShuffleInput(RootVT, V1);
38563 }
38564
38565 SmallVector<int, 64> Mask(BaseMask);
38566
38567 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38568 // etc. can be simplified.
38569 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38570 SmallVector<int> ScaledMask, IdentityMask;
38571 unsigned NumElts = VT1.getVectorNumElements();
38572 if (Mask.size() <= NumElts &&
38573 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38574 for (unsigned i = 0; i != NumElts; ++i)
38575 IdentityMask.push_back(i);
38576 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38577 V2))
38578 return CanonicalizeShuffleInput(RootVT, V1);
38579 }
38580 }
38581
38582 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38583 if (RootVT.is512BitVector() &&
38584 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38585 // If the upper subvectors are zeroable, then an extract+insert is cheaper
38586 // than using X86ISD::SHUF128. The insertion is free, even if it has
38587 // to zero the upper subvectors.
38588 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38589 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38590 return SDValue(); // Nothing to do!
38591 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38592 "Unexpected lane shuffle");
38593 Res = CanonicalizeShuffleInput(RootVT, V1);
38594 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38595 bool UseZero = isAnyZero(Mask);
38596 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38597 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38598 }
38599
38600 // Narrow shuffle mask to v4x128.
38601 SmallVector<int, 4> ScaledMask;
38602 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38603 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38604
38605 // Try to lower to vshuf64x2/vshuf32x4.
38606 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38607 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38608 SelectionDAG &DAG) {
38609 int PermMask[4] = {-1, -1, -1, -1};
38610 // Ensure elements came from the same Op.
38611 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38612 for (int i = 0; i < 4; ++i) {
38613 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38614 if (ScaledMask[i] < 0)
38615 continue;
38616
38617 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38618 unsigned OpIndex = i / 2;
38619 if (Ops[OpIndex].isUndef())
38620 Ops[OpIndex] = Op;
38621 else if (Ops[OpIndex] != Op)
38622 return SDValue();
38623
38624 PermMask[i] = ScaledMask[i] % 4;
38625 }
38626
38627 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38628 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38629 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38630 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38631 };
38632
38633 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38634 // doesn't work because our mask is for 128 bits and we don't have an MVT
38635 // to match that.
38636 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38637 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38638 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38639 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38640 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38641 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38642 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38643 ScaledMask[1] == (ScaledMask[3] % 2));
38644
38645 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38646 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38647 return SDValue(); // Nothing to do!
38648 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38649 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38650 return DAG.getBitcast(RootVT, V);
38651 }
38652 }
38653
38654 // Handle 128-bit lane shuffles of 256-bit vectors.
38655 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38656 // If the upper half is zeroable, then an extract+insert is cheaper
38657 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
38658 // zero the upper half.
38659 if (isUndefOrZero(Mask[1])) {
38660 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38661 return SDValue(); // Nothing to do!
38662 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38663 Res = CanonicalizeShuffleInput(RootVT, V1);
38664 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38665 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38666 256);
38667 }
38668
38669 // If we're inserting the low subvector, an insert-subvector 'concat'
38670 // pattern is quicker than VPERM2X128.
38671 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38672 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38673 !Subtarget.hasAVX2()) {
38674 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38675 return SDValue(); // Nothing to do!
38676 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38677 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38678 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38679 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38680 }
38681
38682 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38683 return SDValue(); // Nothing to do!
38684
38685 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38686 // we need to use the zeroing feature.
38687 // Prefer blends for sequential shuffles unless we are optimizing for size.
38688 if (UnaryShuffle &&
38689 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38690 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
38691 unsigned PermMask = 0;
38692 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38693 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38694 return DAG.getNode(
38695 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38696 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38697 }
38698
38699 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38700 return SDValue(); // Nothing to do!
38701
38702 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38703 if (!UnaryShuffle && !IsMaskedShuffle) {
38704 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38705 "Unexpected shuffle sentinel value");
38706 // Prefer blends to X86ISD::VPERM2X128.
38707 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38708 unsigned PermMask = 0;
38709 PermMask |= ((Mask[0] & 3) << 0);
38710 PermMask |= ((Mask[1] & 3) << 4);
38711 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38712 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38713 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38714 CanonicalizeShuffleInput(RootVT, LHS),
38715 CanonicalizeShuffleInput(RootVT, RHS),
38716 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38717 }
38718 }
38719 }
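// VPERM2X128 immediate recap for the code above: bits [1:0] pick the 128-bit
// source lane for the low half and bits [5:4] pick it for the high half,
// where values 0-1 select V1's lanes and 2-3 select V2's; setting 0x8 in a
// nibble zeroes that half (used in the unary path when Mask[i] < 0). For
// example, Mask = {1, 3} gives PermMask = 1 | (3 << 4) = 0x31.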
38720
38721 // For masks that have been widened to 128-bit elements or more,
38722 // narrow back down to 64-bit elements.
38723 if (BaseMaskEltSizeInBits > 64) {
38724 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38725 int MaskScale = BaseMaskEltSizeInBits / 64;
38726 SmallVector<int, 64> ScaledMask;
38727 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38728 Mask = std::move(ScaledMask);
38729 }
38730
38731 // For masked shuffles, we're trying to match the root width for better
38732 // writemask folding; attempt to scale the mask.
38733 // TODO - variable shuffles might need this to be widened again.
38734 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38735 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38736 int MaskScale = NumRootElts / Mask.size();
38737 SmallVector<int, 64> ScaledMask;
38738 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38739 Mask = std::move(ScaledMask);
38740 }
38741
38742 unsigned NumMaskElts = Mask.size();
38743 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38744 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38745
38746 // Determine the effective mask value type.
38747 FloatDomain &= (32 <= MaskEltSizeInBits);
38748 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38749 : MVT::getIntegerVT(MaskEltSizeInBits);
38750 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38751
38752 // Only allow legal mask types.
38753 if (!TLI.isTypeLegal(MaskVT))
38754 return SDValue();
38755
38756 // Attempt to match the mask against known shuffle patterns.
38757 MVT ShuffleSrcVT, ShuffleVT;
38758 unsigned Shuffle, PermuteImm;
38759
38760 // Which shuffle domains are permitted?
38761 // Permit domain crossing at higher combine depths.
38762 // TODO: Should we indicate which domain is preferred if both are allowed?
38763 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38764 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38765 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38766
38767 // Determine zeroable mask elements.
38768 APInt KnownUndef, KnownZero;
38769 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38770 APInt Zeroable = KnownUndef | KnownZero;
38771
38772 if (UnaryShuffle) {
38773 // Attempt to match against broadcast-from-vector.
38774 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38775 if ((Subtarget.hasAVX2() ||
38776 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38777 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38778 if (isUndefOrEqual(Mask, 0)) {
38779 if (V1.getValueType() == MaskVT &&
38780 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38781 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38782 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38783 return SDValue(); // Nothing to do!
38784 Res = V1.getOperand(0);
38785 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38786 return DAG.getBitcast(RootVT, Res);
38787 }
38788 if (Subtarget.hasAVX2()) {
38789 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38790 return SDValue(); // Nothing to do!
38791 Res = CanonicalizeShuffleInput(MaskVT, V1);
38792 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38793 return DAG.getBitcast(RootVT, Res);
38794 }
38795 }
38796 }
38797
38798 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38799 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38800 (!IsMaskedShuffle ||
38801 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38802 if (Depth == 0 && Root.getOpcode() == Shuffle)
38803 return SDValue(); // Nothing to do!
38804 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38805 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38806 return DAG.getBitcast(RootVT, Res);
38807 }
38808
38809 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38810 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38811 PermuteImm) &&
38812 (!IsMaskedShuffle ||
38813 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38814 if (Depth == 0 && Root.getOpcode() == Shuffle)
38815 return SDValue(); // Nothing to do!
38816 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38817 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38818 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38819 return DAG.getBitcast(RootVT, Res);
38820 }
38821 }
38822
38823 // Attempt to combine to INSERTPS, but only if the inserted element has come
38824 // from a scalar.
38825 // TODO: Handle other insertions here as well?
38826 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38827 Subtarget.hasSSE41() &&
38828 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38829 if (MaskEltSizeInBits == 32) {
38830 SDValue SrcV1 = V1, SrcV2 = V2;
38831 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38832 DAG) &&
38833 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38834 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38835 return SDValue(); // Nothing to do!
38836 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38837 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38838 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38839 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38840 return DAG.getBitcast(RootVT, Res);
38841 }
38842 }
38843 if (MaskEltSizeInBits == 64 &&
38844 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38845 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38846 V2.getScalarValueSizeInBits() <= 32) {
38847 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38848 return SDValue(); // Nothing to do!
38849 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38850 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38851 CanonicalizeShuffleInput(MVT::v4f32, V1),
38852 CanonicalizeShuffleInput(MVT::v4f32, V2),
38853 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38854 return DAG.getBitcast(RootVT, Res);
38855 }
38856 }
38857
38858 SDValue NewV1 = V1; // Save operands in case early exit happens.
38859 SDValue NewV2 = V2;
38860 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38861 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38862 ShuffleVT, UnaryShuffle) &&
38863 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38864 if (Depth == 0 && Root.getOpcode() == Shuffle)
38865 return SDValue(); // Nothing to do!
38866 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38867 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38868 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38869 return DAG.getBitcast(RootVT, Res);
38870 }
38871
38872 NewV1 = V1; // Save operands in case early exit happens.
38873 NewV2 = V2;
38874 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38875 AllowIntDomain, NewV1, NewV2, DL, DAG,
38876 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38877 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38878 if (Depth == 0 && Root.getOpcode() == Shuffle)
38879 return SDValue(); // Nothing to do!
38880 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38881 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38882 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38883 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38884 return DAG.getBitcast(RootVT, Res);
38885 }
38886
38887 // Typically from here on, we need an integer version of MaskVT.
38888 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38889 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38890
38891 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38892 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38893 uint64_t BitLen, BitIdx;
38894 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38895 Zeroable)) {
38896 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38897 return SDValue(); // Nothing to do!
38898 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38899 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38900 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38901 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38902 return DAG.getBitcast(RootVT, Res);
38903 }
38904
38905 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38906 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38907 return SDValue(); // Nothing to do!
38908 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38909 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38910 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38911 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38912 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38913 return DAG.getBitcast(RootVT, Res);
38914 }
38915 }
38916
38917 // Match shuffle against TRUNCATE patterns.
38918 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38919 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38920 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38921 Subtarget)) {
38922 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38923 ShuffleSrcVT.getVectorNumElements();
38924 unsigned Opc =
38925 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38926 if (Depth == 0 && Root.getOpcode() == Opc)
38927 return SDValue(); // Nothing to do!
38928 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38929 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38930 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38931 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38932 return DAG.getBitcast(RootVT, Res);
38933 }
38934
38935 // Do we need a more general binary truncation pattern?
38936 if (RootSizeInBits < 512 &&
38937 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38938 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38939 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38940 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38941 // Bail if this was already a truncation or PACK node.
38942 // We sometimes fail to match PACK if we demand known undef elements.
38943 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38944 Root.getOpcode() == X86ISD::PACKSS ||
38945 Root.getOpcode() == X86ISD::PACKUS))
38946 return SDValue(); // Nothing to do!
38947 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38948 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38949 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38950 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38951 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38952 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38953 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38954 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38955 return DAG.getBitcast(RootVT, Res);
38956 }
38957 }
38958
38959 // Don't try to re-form single instruction chains under any circumstances now
38960 // that we've done encoding canonicalization for them.
38961 if (Depth < 1)
38962 return SDValue();
38963
38964 // Depth threshold above which we can efficiently use variable mask shuffles.
38965 int VariableCrossLaneShuffleDepth =
38966 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38967 int VariablePerLaneShuffleDepth =
38968 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38969 AllowVariableCrossLaneMask &=
38970 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38971 AllowVariablePerLaneMask &=
38972 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38973 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38974 // higher depth before combining them.
38975 bool AllowBWIVPERMV3 =
38976 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38977
38978 bool MaskContainsZeros = isAnyZero(Mask);
38979
38980 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38981 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38982 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38983 if (Subtarget.hasAVX2() &&
38984 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38985 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38986 Res = CanonicalizeShuffleInput(MaskVT, V1);
38987 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38988 return DAG.getBitcast(RootVT, Res);
38989 }
38990 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38991 if ((Subtarget.hasAVX512() &&
38992 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38993 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38994 (Subtarget.hasBWI() &&
38995 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38996 (Subtarget.hasVBMI() &&
38997 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38998 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38999 V2 = DAG.getUNDEF(MaskVT);
39000 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39001 return DAG.getBitcast(RootVT, Res);
39002 }
39003 }
39004
39005 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
39006 // vector as the second source (non-VLX will pad to 512-bit shuffles).
39007 if (UnaryShuffle && AllowVariableCrossLaneMask &&
39008 ((Subtarget.hasAVX512() &&
39009 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39010 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39011 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
39012 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39013 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39014 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39015 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39016 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39017 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39018 for (unsigned i = 0; i != NumMaskElts; ++i)
39019 if (Mask[i] == SM_SentinelZero)
39020 Mask[i] = NumMaskElts + i;
39021 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39022 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
39023 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39024 return DAG.getBitcast(RootVT, Res);
39025 }
39026
39027 // If that failed and either input is extracted then try to combine as a
39028 // shuffle with the larger type.
39029 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39030 Inputs, Root, BaseMask, Depth, HasVariableMask,
39031 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
39032 Subtarget))
39033 return WideShuffle;
39034
39035 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39036 // (non-VLX will pad to 512-bit shuffles).
39037 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
39038 ((Subtarget.hasAVX512() &&
39039 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39040 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39041 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
39042 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
39043 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39044 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39045 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39046 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39047 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39048 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39049 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39050 return DAG.getBitcast(RootVT, Res);
39051 }
39052 return SDValue();
39053 }
39054
39055 // See if we can combine a single input shuffle with zeros to a bit-mask,
39056 // which is much simpler than any shuffle.
39057 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
39058 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
39059 TLI.isTypeLegal(MaskVT)) {
39060 APInt Zero = APInt::getZero(MaskEltSizeInBits);
39061 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
39062 APInt UndefElts(NumMaskElts, 0);
39063 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
39064 for (unsigned i = 0; i != NumMaskElts; ++i) {
39065 int M = Mask[i];
39066 if (M == SM_SentinelUndef) {
39067 UndefElts.setBit(i);
39068 continue;
39069 }
39070 if (M == SM_SentinelZero)
39071 continue;
39072 EltBits[i] = AllOnes;
39073 }
39074 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
39075 Res = CanonicalizeShuffleInput(MaskVT, V1);
39076 unsigned AndOpcode =
39077 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
39078 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
39079 return DAG.getBitcast(RootVT, Res);
39080 }
39081
39082 // If we have a single input shuffle with different shuffle patterns in the
39083 // 128-bit lanes, use a variable-mask VPERMILPS.
39084 // TODO: Combine other mask types at higher depths.
39085 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39086 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
39087 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
39088 SmallVector<SDValue, 16> VPermIdx;
39089 for (int M : Mask) {
39090 SDValue Idx =
39091 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
39092 VPermIdx.push_back(Idx);
39093 }
39094 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
39095 Res = CanonicalizeShuffleInput(MaskVT, V1);
39096 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
39097 return DAG.getBitcast(RootVT, Res);
39098 }
39099
39100 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39101 // to VPERMIL2PD/VPERMIL2PS.
39102 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
39103 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
39104 MaskVT == MVT::v8f32)) {
39105 // VPERMIL2 Operation.
39106 // Bits[3] - Match Bit.
39107 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39108 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
39109 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
39110 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
39111 SmallVector<int, 8> VPerm2Idx;
39112 unsigned M2ZImm = 0;
39113 for (int M : Mask) {
39114 if (M == SM_SentinelUndef) {
39115 VPerm2Idx.push_back(-1);
39116 continue;
39117 }
39118 if (M == SM_SentinelZero) {
39119 M2ZImm = 2;
39120 VPerm2Idx.push_back(8);
39121 continue;
39122 }
39123 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
39124 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
39125 VPerm2Idx.push_back(Index);
39126 }
39127 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39128 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39129 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
39130 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
39131 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
39132 return DAG.getBitcast(RootVT, Res);
39133 }
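// For example, on v4f32 a mask element M = 6 (element 2 of V2) becomes the
// selector (6 % 4) + (6 / 4) * 4 = 6: selector values 0-3 pick from the
// first source and 4-7 from the second within each lane, while the value 8
// together with M2ZImm = 2 zeroes the element.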
39134
39135 // If we have 3 or more shuffle instructions or a chain involving a variable
39136 // mask, we can replace them with a single PSHUFB instruction profitably.
39137 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
39138 // instructions, but in practice PSHUFB tends to be *very* fast so we're
39139 // more aggressive.
39140 if (UnaryShuffle && AllowVariablePerLaneMask &&
39141 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39142 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
39143 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
39144 SmallVector<SDValue, 16> PSHUFBMask;
39145 int NumBytes = RootVT.getSizeInBits() / 8;
39146 int Ratio = NumBytes / NumMaskElts;
39147 for (int i = 0; i < NumBytes; ++i) {
39148 int M = Mask[i / Ratio];
39149 if (M == SM_SentinelUndef) {
39150 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
39151 continue;
39152 }
39153 if (M == SM_SentinelZero) {
39154 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39155 continue;
39156 }
39157 M = Ratio * M + i % Ratio;
39158 assert((M / 16) == (i / 16) && "Lane crossing detected");
39159 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39160 }
39161 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
39162 Res = CanonicalizeShuffleInput(ByteVT, V1);
39163 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
39164 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
39165 return DAG.getBitcast(RootVT, Res);
39166 }
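// Worked example: widening a v4i32 mask element M = 2 with Ratio = 4 emits
// the byte selectors {8, 9, 10, 11}, while SM_SentinelZero emits 0x80 since
// PSHUFB zeroes any destination byte whose selector has bit 7 set.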
39167
39168 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39169 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39170 // slower than PSHUFB on targets that support both.
39171 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
39172 Subtarget.hasXOP()) {
39173 // VPPERM Mask Operation
39174 // Bits[4:0] - Byte Index (0 - 31)
39175 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
39176 SmallVector<SDValue, 16> VPPERMMask;
39177 int NumBytes = 16;
39178 int Ratio = NumBytes / NumMaskElts;
39179 for (int i = 0; i < NumBytes; ++i) {
39180 int M = Mask[i / Ratio];
39181 if (M == SM_SentinelUndef) {
39182 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
39183 continue;
39184 }
39185 if (M == SM_SentinelZero) {
39186 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39187 continue;
39188 }
39189 M = Ratio * M + i % Ratio;
39190 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39191 }
39192 MVT ByteVT = MVT::v16i8;
39193 V1 = CanonicalizeShuffleInput(ByteVT, V1);
39194 V2 = CanonicalizeShuffleInput(ByteVT, V2);
39195 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
39196 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
39197 return DAG.getBitcast(RootVT, Res);
39198 }
39199
39200 // If that failed and either input is extracted then try to combine as a
39201 // shuffle with the larger type.
39202 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39203 Inputs, Root, BaseMask, Depth, HasVariableMask,
39204 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
39205 return WideShuffle;
39206
39207 // If we have a dual input shuffle then lower to VPERMV3,
39208 // (non-VLX will pad to 512-bit shuffles)
39209 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39210 ((Subtarget.hasAVX512() &&
39211 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
39212 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
39213 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
39214 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
39215 MaskVT == MVT::v16i32)) ||
39216 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39217 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
39218 MaskVT == MVT::v32i16)) ||
39219 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39220 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
39221 MaskVT == MVT::v64i8)))) {
39222 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39223 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39224 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39225 return DAG.getBitcast(RootVT, Res);
39226 }
39227
39228 // Failed to find any combines.
39229 return SDValue();
39230}
39231
39232// Combine an arbitrary chain of shuffles + extract_subvectors into a single
39233// instruction if possible.
39234//
39235// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
39236// type size to attempt to combine:
39237// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
39238// -->
39239// extract_subvector(shuffle(x,y,m2),0)
39240 static SDValue combineX86ShuffleChainWithExtract(
39241 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39242 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39243 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39244 const X86Subtarget &Subtarget) {
39245 unsigned NumMaskElts = BaseMask.size();
39246 unsigned NumInputs = Inputs.size();
39247 if (NumInputs == 0)
39248 return SDValue();
39249
39250 EVT RootVT = Root.getValueType();
39251 unsigned RootSizeInBits = RootVT.getSizeInBits();
39252 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
39253 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
39254
39255 // Peek through extract_subvector to find widest legal vector.
39256 // TODO: Handle ISD::TRUNCATE
39257 unsigned WideSizeInBits = RootSizeInBits;
39258 for (unsigned I = 0; I != NumInputs; ++I) {
39259 SDValue Input = peekThroughBitcasts(Inputs[I]);
39260 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
39261 Input = peekThroughBitcasts(Input.getOperand(0));
39262 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
39263 WideSizeInBits < Input.getValueSizeInBits())
39264 WideSizeInBits = Input.getValueSizeInBits();
39265 }
39266
39267 // Bail if we fail to find a source larger than the existing root.
39268 unsigned Scale = WideSizeInBits / RootSizeInBits;
39269 if (WideSizeInBits <= RootSizeInBits ||
39270 (WideSizeInBits % RootSizeInBits) != 0)
39271 return SDValue();
39272
39273 // Create new mask for larger type.
39274 SmallVector<int, 64> WideMask(BaseMask);
39275 for (int &M : WideMask) {
39276 if (M < 0)
39277 continue;
39278 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
39279 }
39280 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
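// e.g. with NumMaskElts = 4 and Scale = 2, a mask element M = 5 (element 1
// of the second input) becomes 1 + 1*2*4 = 9, i.e. element 1 of the second
// *widened* input, and the mask grows from 4 to 8 elements with trailing
// undefs.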
39281
39282 // Attempt to peek through inputs and adjust mask when we extract from an
39283 // upper subvector.
39284 int AdjustedMasks = 0;
39285 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
39286 for (unsigned I = 0; I != NumInputs; ++I) {
39287 SDValue &Input = WideInputs[I];
39288 Input = peekThroughBitcasts(Input);
39289 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39290 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39291 uint64_t Idx = Input.getConstantOperandVal(1);
39292 if (Idx != 0) {
39293 ++AdjustedMasks;
39294 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39295 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39296
39297 int lo = I * WideMask.size();
39298 int hi = (I + 1) * WideMask.size();
39299 for (int &M : WideMask)
39300 if (lo <= M && M < hi)
39301 M += Idx;
39302 }
39303 Input = peekThroughBitcasts(Input.getOperand(0));
39304 }
39305 }
39306
39307 // Remove unused/repeated shuffle source ops.
39308 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39309 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39310
39311 // Bail if we're always extracting from the lowest subvectors,
39312 // combineX86ShuffleChain should match this for the current width, or the
39313 // shuffle still references too many inputs.
39314 if (AdjustedMasks == 0 || WideInputs.size() > 2)
39315 return SDValue();
39316
39317 // Minor canonicalization of the accumulated shuffle mask to make it easier
39318 // to match below. All this does is detect masks with sequential pairs of
39319 // elements, and shrink them to the half-width mask. It does this in a loop
39320 // so it will reduce the size of the mask to the minimal width mask which
39321 // performs an equivalent shuffle.
39322 while (WideMask.size() > 1) {
39323 SmallVector<int, 64> WidenedMask;
39324 if (!canWidenShuffleElements(WideMask, WidenedMask))
39325 break;
39326 WideMask = std::move(WidenedMask);
39327 }
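// e.g. <0,1,2,3> shrinks to <0,1> and then to <0>, while <2,3,0,1> shrinks
// to <1,0> and stops there, since that pair ordering cannot be widened
// further.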
39328
39329 // Canonicalization of binary shuffle masks to improve pattern matching by
39330 // commuting the inputs.
39331 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
39332 ShuffleVectorSDNode::commuteMask(WideMask);
39333 std::swap(WideInputs[0], WideInputs[1]);
39334 }
39335
39336 // Increase depth for every upper subvector we've peeked through.
39337 Depth += AdjustedMasks;
39338
39339 // Attempt to combine wider chain.
39340 // TODO: Can we use a better Root?
39341 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39342 WideInputs.back().getValueSizeInBits()
39343 ? WideInputs.front()
39344 : WideInputs.back();
39345 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
39346 "WideRootSize mismatch");
39347
39348 if (SDValue WideShuffle =
39349 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39350 HasVariableMask, AllowVariableCrossLaneMask,
39351 AllowVariablePerLaneMask, DAG, Subtarget)) {
39352 WideShuffle =
39353 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39354 return DAG.getBitcast(RootVT, WideShuffle);
39355 }
39356
39357 return SDValue();
39358}
39359
39360// Canonicalize the combined shuffle mask chain with horizontal ops.
39361// NOTE: This may update the Ops and Mask.
39362 static SDValue canonicalizeShuffleMaskWithHorizOp(
39363 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39364 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39365 const X86Subtarget &Subtarget) {
39366 if (Mask.empty() || Ops.empty())
39367 return SDValue();
39368
39369 SmallVector<SDValue> BC;
39370 for (SDValue Op : Ops)
39371 BC.push_back(peekThroughBitcasts(Op));
39372
39373 // All ops must be the same horizop + type.
39374 SDValue BC0 = BC[0];
39375 EVT VT0 = BC0.getValueType();
39376 unsigned Opcode0 = BC0.getOpcode();
39377 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39378 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39379 }))
39380 return SDValue();
39381
39382 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39383 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39384 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39385 if (!isHoriz && !isPack)
39386 return SDValue();
39387
39388 // Do all ops have a single use?
39389 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39390 return Op.hasOneUse() &&
39391 peekThroughBitcasts(Op).hasOneUse();
39392 });
39393
39394 int NumElts = VT0.getVectorNumElements();
39395 int NumLanes = VT0.getSizeInBits() / 128;
39396 int NumEltsPerLane = NumElts / NumLanes;
39397 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39398 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39399 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39400
39401 if (NumEltsPerLane >= 4 &&
39402 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39403 SmallVector<int> LaneMask, ScaledMask;
39404 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39405 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39406 // See if we can remove the shuffle by resorting the HOP chain so that
39407 // the HOP args are pre-shuffled.
39408 // TODO: Generalize to any sized/depth chain.
39409 // TODO: Add support for PACKSS/PACKUS.
39410 if (isHoriz) {
39411 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39412 auto GetHOpSrc = [&](int M) {
39413 if (M == SM_SentinelUndef)
39414 return DAG.getUNDEF(VT0);
39415 if (M == SM_SentinelZero)
39416 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39417 SDValue Src0 = BC[M / 4];
39418 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39419 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39420 return Src1.getOperand(M % 2);
39421 return SDValue();
39422 };
39423 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39424 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39425 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39426 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39427 if (M0 && M1 && M2 && M3) {
39428 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39429 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39430 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39431 }
39432 }
39433 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39434 if (Ops.size() >= 2) {
39435 SDValue LHS, RHS;
39436 auto GetHOpSrc = [&](int M, int &OutM) {
39437 // TODO: Support SM_SentinelZero
39438 if (M < 0)
39439 return M == SM_SentinelUndef;
39440 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39441 if (!LHS || LHS == Src) {
39442 LHS = Src;
39443 OutM = (M % 2);
39444 return true;
39445 }
39446 if (!RHS || RHS == Src) {
39447 RHS = Src;
39448 OutM = (M % 2) + 2;
39449 return true;
39450 }
39451 return false;
39452 };
39453 int PostMask[4] = {-1, -1, -1, -1};
39454 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39455 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39456 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39457 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39458 LHS = DAG.getBitcast(SrcVT, LHS);
39459 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39460 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39461 // Use SHUFPS for the permute so this will work on SSE2 targets,
39462 // shuffle combining and domain handling will simplify this later on.
39463 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39464 Res = DAG.getBitcast(ShuffleVT, Res);
39465 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39466 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39467 }
39468 }
39469 }
39470 }
39471
39472 if (2 < Ops.size())
39473 return SDValue();
39474
39475 SDValue BC1 = BC[BC.size() - 1];
39476 if (Mask.size() == VT0.getVectorNumElements()) {
39477 // Canonicalize binary shuffles of horizontal ops that use the
39478 // same sources to a unary shuffle.
39479 // TODO: Try to perform this fold even if the shuffle remains.
39480 if (Ops.size() == 2) {
39481 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39482 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39483 };
39484 // Commute if all BC0's ops are contained in BC1.
39485 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39486 ContainsOps(BC1, BC0.getOperand(1))) {
39487 ShuffleVectorSDNode::commuteMask(Mask);
39488 std::swap(Ops[0], Ops[1]);
39489 std::swap(BC0, BC1);
39490 }
39491
39492 // If BC1 can be represented by BC0, then convert to unary shuffle.
39493 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39494 ContainsOps(BC0, BC1.getOperand(1))) {
39495 for (int &M : Mask) {
39496 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39497 continue;
39498 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39499 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39500 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39501 M += NumHalfEltsPerLane;
39502 }
39503 }
39504 }
39505
39506 // Canonicalize unary horizontal ops to only refer to lower halves.
39507 for (int i = 0; i != NumElts; ++i) {
39508 int &M = Mask[i];
39509 if (isUndefOrZero(M))
39510 continue;
39511 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39512 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39513 M -= NumHalfEltsPerLane;
39514 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39515 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39516 M -= NumHalfEltsPerLane;
39517 }
39518 }
39519
39520 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39521 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39522 // represents the LHS/RHS inputs for the lower/upper halves.
39523 SmallVector<int, 16> TargetMask128, WideMask128;
39524 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39525 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39526 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39527 bool SingleOp = (Ops.size() == 1);
39528 if (isPack || OneUseOps ||
39529 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39530 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39531 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39532 Lo = Lo.getOperand(WideMask128[0] & 1);
39533 Hi = Hi.getOperand(WideMask128[1] & 1);
39534 if (SingleOp) {
39535 SDValue Undef = DAG.getUNDEF(SrcVT);
39536 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39537 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39538 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39539 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39540 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39541 }
39542 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39543 }
39544 }
39545
39546 // If we are post-shuffling a 256-bit hop and not requiring the upper
39547 // elements, then try to narrow to a 128-bit hop directly.
39548 SmallVector<int, 16> WideMask64;
39549 if (Ops.size() == 1 && NumLanes == 2 &&
39550 scaleShuffleElements(Mask, 4, WideMask64) &&
39551 isUndefInRange(WideMask64, 2, 2)) {
39552 int M0 = WideMask64[0];
39553 int M1 = WideMask64[1];
39554 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39555 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
39556 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39557 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39558 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39559 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39560 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39561 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39562 }
39563 }
39564
39565 return SDValue();
39566}
39567
39568// Attempt to constant fold all of the constant source ops.
39569// Returns true if the entire shuffle is folded to a constant.
39570// TODO: Extend this to merge multiple constant Ops and update the mask.
39571 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
39572 ArrayRef<int> Mask,
39573 bool HasVariableMask,
39574 SelectionDAG &DAG, const SDLoc &DL,
39575 const X86Subtarget &Subtarget) {
39576 unsigned SizeInBits = VT.getSizeInBits();
39577 unsigned NumMaskElts = Mask.size();
39578 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39579 unsigned NumOps = Ops.size();
39580
39581 // Extract constant bits from each source op.
39582 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39583 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39584 for (unsigned I = 0; I != NumOps; ++I)
39585 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39586 RawBitsOps[I],
39587 /*AllowWholeUndefs*/ true,
39588 /*AllowPartialUndefs*/ true))
39589 return SDValue();
39590
39591 // If we're optimizing for size, only fold if at least one of the constants is
39592 // only used once or the combined shuffle has included a variable mask
39593 // shuffle; this avoids constant pool bloat.
39594 bool IsOptimizingSize = DAG.shouldOptForSize();
39595 if (IsOptimizingSize && !HasVariableMask &&
39596 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39597 return SDValue();
39598
39599 // Shuffle the constant bits according to the mask.
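// e.g. for Ops[0] = <1,2,3,4>, Ops[1] = <5,6,7,8> and
// Mask = <0, 5, SM_SentinelZero, SM_SentinelUndef>, this produces the
// constant data <1, 6, 0, undef>.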
39600 APInt UndefElts(NumMaskElts, 0);
39601 APInt ZeroElts(NumMaskElts, 0);
39602 APInt ConstantElts(NumMaskElts, 0);
39603 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39604 APInt::getZero(MaskSizeInBits));
39605 for (unsigned i = 0; i != NumMaskElts; ++i) {
39606 int M = Mask[i];
39607 if (M == SM_SentinelUndef) {
39608 UndefElts.setBit(i);
39609 continue;
39610 } else if (M == SM_SentinelZero) {
39611 ZeroElts.setBit(i);
39612 continue;
39613 }
39614 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39615
39616 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39617 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39618
39619 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39620 if (SrcUndefElts[SrcMaskIdx]) {
39621 UndefElts.setBit(i);
39622 continue;
39623 }
39624
39625 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39626 APInt &Bits = SrcEltBits[SrcMaskIdx];
39627 if (!Bits) {
39628 ZeroElts.setBit(i);
39629 continue;
39630 }
39631
39632 ConstantElts.setBit(i);
39633 ConstantBitData[i] = Bits;
39634 }
39635 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39636
39637 // Attempt to create a zero vector.
39638 if ((UndefElts | ZeroElts).isAllOnes())
39639 return getZeroVector(VT, Subtarget, DAG, DL);
39640
39641 // Create the constant data.
39642 MVT MaskSVT;
39643 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39644 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39645 else
39646 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39647
39648 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39649 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39650 return SDValue();
39651
39652 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39653 return DAG.getBitcast(VT, CstOp);
39654}
39655
39656namespace llvm {
39657 namespace X86 {
39658 enum {
39659 MaxShuffleCombineDepth = 8
39660 };
39661 } // namespace X86
39662} // namespace llvm
39663
39664/// Fully generic combining of x86 shuffle instructions.
39665///
39666/// This should be the last combine run over the x86 shuffle instructions. Once
39667/// they have been fully optimized, this will recursively consider all chains
39668/// of single-use shuffle instructions, build a generic model of the cumulative
39669/// shuffle operation, and check for simpler instructions which implement this
39670/// operation. We use this primarily for two purposes:
39671///
39672/// 1) Collapse generic shuffles to specialized single instructions when
39673/// equivalent. In most cases, this is just an encoding size win, but
39674/// sometimes we will collapse multiple generic shuffles into a single
39675/// special-purpose shuffle.
39676/// 2) Look for sequences of shuffle instructions with 3 or more total
39677/// instructions, and replace them with the slightly more expensive SSSE3
39678/// PSHUFB instruction if available. We do this as the last combining step
39679/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39680/// a suitable short sequence of other instructions. The PSHUFB will either
39681/// use a register or have to read from memory and so is slightly (but only
39682/// slightly) more expensive than the other shuffle instructions.
39683///
39684/// Because this is inherently a quadratic operation (for each shuffle in
39685/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39686/// This should never be an issue in practice as the shuffle lowering doesn't
39687/// produce sequences of more than 8 instructions.
39688///
39689/// FIXME: We will currently miss some cases where the redundant shuffling
39690/// would simplify under the threshold for PSHUFB formation because of
39691/// combine-ordering. To fix this, we should do the redundant instruction
39692/// combining in this recursive walk.
39693 static SDValue combineX86ShufflesRecursively(
39694 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39695 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39696 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39697 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39698 const X86Subtarget &Subtarget) {
39699 assert(!RootMask.empty() &&
39700 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39701 "Illegal shuffle root mask");
39702 MVT RootVT = Root.getSimpleValueType();
39703 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39704 unsigned RootSizeInBits = RootVT.getSizeInBits();
39705 SDLoc DL(Root);
39706
39707 // Bound the depth of our recursive combine because this is ultimately
39708 // quadratic in nature.
39709 if (Depth >= MaxDepth)
39710 return SDValue();
39711
39712 // Directly rip through bitcasts to find the underlying operand.
39713 SDValue Op = SrcOps[SrcOpIndex];
39714 Op = peekThroughBitcasts(Op);
39715
39716 EVT VT = Op.getValueType();
39717 if (!VT.isVector() || !VT.isSimple())
39718 return SDValue(); // Bail if we hit a non-simple non-vector.
39719
39720 // FIXME: Just bail on f16 for now.
39721 if (VT.getVectorElementType() == MVT::f16)
39722 return SDValue();
39723
39724 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39725 "Can only combine shuffles upto size of the root op.");
39726
39727 // Create a demanded elts mask from the referenced elements of Op.
39728 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39729 for (int M : RootMask) {
39730 int BaseIdx = RootMask.size() * SrcOpIndex;
39731 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39732 OpDemandedElts.setBit(M - BaseIdx);
39733 }
39734 if (RootSizeInBits != VT.getSizeInBits()) {
39735 // Op is smaller than Root - extract the demanded elts for the subvector.
39736 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39737 unsigned NumOpMaskElts = RootMask.size() / Scale;
39738 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39739 assert(OpDemandedElts
39740 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39741 .isZero() &&
39742 "Out of range elements referenced in root mask");
39743 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39744 }
39745 OpDemandedElts =
39746 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
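// e.g. if the root mask has 4 elements but Op is a v8i16, demanded root
// elements {0,2} scale to demanded Op elements {0,1,4,5}.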
39747
39748 // Extract target shuffle mask and resolve sentinels and inputs.
39749 SmallVector<int, 64> OpMask;
39750 SmallVector<SDValue, 2> OpInputs;
39751 APInt OpUndef, OpZero;
39752 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39753 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39754 OpZero, DAG, Depth, false)) {
39755 // Shuffle inputs must not be larger than the shuffle result.
39756 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39757 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39758 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39759 }))
39760 return SDValue();
39761 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39762 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39763 !isNullConstant(Op.getOperand(1))) {
39764 SDValue SrcVec = Op.getOperand(0);
39765 int ExtractIdx = Op.getConstantOperandVal(1);
39766 unsigned NumElts = VT.getVectorNumElements();
39767 OpInputs.assign({SrcVec});
39768 OpMask.assign(NumElts, SM_SentinelUndef);
39769 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39770 OpZero = OpUndef = APInt::getZero(NumElts);
39771 } else {
39772 return SDValue();
39773 }
39774
39775 // If the shuffle result was smaller than the root, we need to adjust the
39776 // mask indices and pad the mask with undefs.
39777 if (RootSizeInBits > VT.getSizeInBits()) {
39778 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39779 unsigned OpMaskSize = OpMask.size();
39780 if (OpInputs.size() > 1) {
39781 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39782 for (int &M : OpMask) {
39783 if (M < 0)
39784 continue;
39785 int EltIdx = M % OpMaskSize;
39786 int OpIdx = M / OpMaskSize;
39787 M = (PaddedMaskSize * OpIdx) + EltIdx;
39788 }
39789 }
39790 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39791 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39792 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39793 }
39794
39795 SmallVector<int, 64> Mask;
39796 SmallVector<SDValue, 16> Ops;
39797
39798 // We don't need to merge masks if the root is empty.
39799 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39800 if (EmptyRoot) {
39801 // Only resolve zeros if it will remove an input, otherwise we might end
39802 // up in an infinite loop.
39803 bool ResolveKnownZeros = true;
39804 if (!OpZero.isZero()) {
39805 APInt UsedInputs = APInt::getZero(OpInputs.size());
39806 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39807 int M = OpMask[i];
39808 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39809 continue;
39810 UsedInputs.setBit(M / OpMask.size());
39811 if (UsedInputs.isAllOnes()) {
39812 ResolveKnownZeros = false;
39813 break;
39814 }
39815 }
39816 }
39817 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39818 ResolveKnownZeros);
39819
39820 Mask = OpMask;
39821 Ops.append(OpInputs.begin(), OpInputs.end());
39822 } else {
39823 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39824
39825 // Add the inputs to the Ops list, avoiding duplicates.
39826 Ops.append(SrcOps.begin(), SrcOps.end());
39827
39828 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39829 // Attempt to find an existing match.
39830 SDValue InputBC = peekThroughBitcasts(Input);
39831 for (int i = 0, e = Ops.size(); i < e; ++i)
39832 if (InputBC == peekThroughBitcasts(Ops[i]))
39833 return i;
39834 // Match failed - should we replace an existing Op?
39835 if (InsertionPoint >= 0) {
39836 Ops[InsertionPoint] = Input;
39837 return InsertionPoint;
39838 }
39839 // Add to the end of the Ops list.
39840 Ops.push_back(Input);
39841 return Ops.size() - 1;
39842 };
39843
39844 SmallVector<int, 2> OpInputIdx;
39845 for (SDValue OpInput : OpInputs)
39846 OpInputIdx.push_back(
39847 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39848
39849 assert(((RootMask.size() > OpMask.size() &&
39850 RootMask.size() % OpMask.size() == 0) ||
39851 (OpMask.size() > RootMask.size() &&
39852 OpMask.size() % RootMask.size() == 0) ||
39853 OpMask.size() == RootMask.size()) &&
39854 "The smaller number of elements must divide the larger.");
39855
39856 // This function can be performance-critical, so we rely on the power-of-2
39857 // knowledge that we have about the mask sizes to replace div/rem ops with
39858 // bit-masks and shifts.
39859 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39860 "Non-power-of-2 shuffle mask sizes");
39861 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39862 "Non-power-of-2 shuffle mask sizes");
39863 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39864 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39865
39866 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39867 unsigned RootRatio =
39868 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39869 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39870 assert((RootRatio == 1 || OpRatio == 1) &&
39871 "Must not have a ratio for both incoming and op masks!");
39872
39873 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39874 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39875 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39876 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39877 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39878
39879 Mask.resize(MaskWidth, SM_SentinelUndef);
39880
39881 // Merge this shuffle operation's mask into our accumulated mask. Note that
39882 // this shuffle's mask will be the first applied to the input, followed by
39883 // the root mask to get us all the way to the root value arrangement. The
39884 // reason for this order is that we are recursing up the operation chain.
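// For example, merging a v4i32 PSHUFD root mask <2,3,0,1> into a v8i16
// PSHUFLW op mask <1,0,3,2,4,5,6,7> gives MaskWidth = 8, RootRatio = 2 and
// OpRatio = 1, and accumulates to the single v8i16 mask <4,5,6,7,1,0,3,2>,
// which is equivalent to the original two-instruction chain.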
39885 for (unsigned i = 0; i < MaskWidth; ++i) {
39886 unsigned RootIdx = i >> RootRatioLog2;
39887 if (RootMask[RootIdx] < 0) {
39888 // This is a zero or undef lane, we're done.
39889 Mask[i] = RootMask[RootIdx];
39890 continue;
39891 }
39892
39893 unsigned RootMaskedIdx =
39894 RootRatio == 1
39895 ? RootMask[RootIdx]
39896 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39897
39898 // Just insert the scaled root mask value if it references an input other
39899 // than the SrcOp we're currently inserting.
39900 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39901 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39902 Mask[i] = RootMaskedIdx;
39903 continue;
39904 }
39905
39906 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39907 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39908 if (OpMask[OpIdx] < 0) {
39909 // The incoming lanes are zero or undef, it doesn't matter which ones we
39910 // are using.
39911 Mask[i] = OpMask[OpIdx];
39912 continue;
39913 }
39914
39915 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39916 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39917 : (OpMask[OpIdx] << OpRatioLog2) +
39918 (RootMaskedIdx & (OpRatio - 1));
39919
39920 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39921 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39922 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39923 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39924
39925 Mask[i] = OpMaskedIdx;
39926 }
39927 }
39928
39929 // Peek through vector widenings and set out of bounds mask indices to undef.
39930 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39931 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39932 SDValue &Op = Ops[I];
39933 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39934 isNullConstant(Op.getOperand(2))) {
39935 Op = Op.getOperand(1);
39936 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39937 int Lo = I * Mask.size();
39938 int Hi = (I + 1) * Mask.size();
39939 int NewHi = Lo + (Mask.size() / Scale);
39940 for (int &M : Mask) {
39941 if (Lo <= M && NewHi <= M && M < Hi)
39942 M = SM_SentinelUndef;
39943 }
39944 }
39945 }
39946
39947 // Peek through any free extract_subvector nodes back to root size.
39948 for (SDValue &Op : Ops)
39949 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39950 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39951 isNullConstant(Op.getOperand(1)))
39952 Op = Op.getOperand(0);
39953
39954 // Remove unused/repeated shuffle source ops.
39955 resolveTargetShuffleInputsAndMask(Ops, Mask);
39956
39957 // Handle the all undef/zero/ones cases early.
39958 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39959 return DAG.getUNDEF(RootVT);
39960 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39961 return getZeroVector(RootVT, Subtarget, DAG, DL);
39962 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39963 !llvm::is_contained(Mask, SM_SentinelZero))
39964 return getOnesVector(RootVT, DAG, DL);
39965
39966 assert(!Ops.empty() && "Shuffle with no inputs detected");
39967 HasVariableMask |= IsOpVariableMask;
39968
39969 // Update the list of shuffle nodes that have been combined so far.
39970 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39971 SrcNodes.end());
39972 CombinedNodes.push_back(Op.getNode());
39973
39974 // See if we can recurse into each shuffle source op (if it's a target
39975 // shuffle). The source op should only be generally combined if it either has
39976 // a single use (i.e. current Op) or all its users have already been combined;
39977 // if not, we can still combine but should prevent generation of variable
39978 // shuffles to avoid constant pool bloat.
39979 // Don't recurse if we already have more source ops than we can combine in
39980 // the remaining recursion depth.
39981 if (Ops.size() < (MaxDepth - Depth)) {
39982 for (int i = 0, e = Ops.size(); i < e; ++i) {
39983 // For empty roots, we need to resolve zeroable elements before combining
39984 // them with other shuffles.
39985 SmallVector<int, 64> ResolvedMask = Mask;
39986 if (EmptyRoot)
39987 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39988 bool AllowCrossLaneVar = false;
39989 bool AllowPerLaneVar = false;
39990 if (Ops[i].getNode()->hasOneUse() ||
39991 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39992 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39993 AllowPerLaneVar = AllowVariablePerLaneMask;
39994 }
39995 if (SDValue Res = combineX86ShufflesRecursively(
39996 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39997 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39998 Subtarget))
39999 return Res;
40000 }
40001 }
40002
40003 // Attempt to constant fold all of the constant source ops.
40004 if (SDValue Cst = combineX86ShufflesConstants(
40005 RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget))
40006 return Cst;
40007
40008 // If constant fold failed and we only have constants - then we have
40009 // multiple uses by a single non-variable shuffle - just bail.
40010 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
40011 APInt UndefElts;
40012 SmallVector<APInt> RawBits;
40013 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40014 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40015 RawBits,
40016 /*AllowWholeUndefs*/ true,
40017 /*AllowPartialUndefs*/ true);
40018 })) {
40019 return SDValue();
40020 }
40021
40022 // Canonicalize the combined shuffle mask chain with horizontal ops.
40023 // NOTE: This will update the Ops and Mask.
40024 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
40025 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
40026 return DAG.getBitcast(RootVT, HOp);
40027
40028 // Try to refine our inputs given our knowledge of target shuffle mask.
40029 for (auto I : enumerate(Ops)) {
40030 int OpIdx = I.index();
40031 SDValue &Op = I.value();
40032
40033 // What range of shuffle mask element values results in picking from Op?
40034 int Lo = OpIdx * Mask.size();
40035 int Hi = Lo + Mask.size();
40036
40037 // Which elements of Op do we demand, given the mask's granularity?
40038 APInt OpDemandedElts(Mask.size(), 0);
40039 for (int MaskElt : Mask) {
40040 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
40041 int OpEltIdx = MaskElt - Lo;
40042 OpDemandedElts.setBit(OpEltIdx);
40043 }
40044 }
40045
40046 // Is the shuffle result smaller than the root?
40047 if (Op.getValueSizeInBits() < RootSizeInBits) {
40048 // We padded the mask with undefs. But we now need to undo that.
40049 unsigned NumExpectedVectorElts = Mask.size();
40050 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
40051 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
40052 assert(!OpDemandedElts.extractBits(
40053 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40054 "Demanding the virtual undef widening padding?");
40055 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
40056 }
40057
40058 // The Op itself may be of different VT, so we need to scale the mask.
40059 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
40060 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
40061
40062 // Can this operand be simplified any further, given its demanded elements?
40063 if (SDValue NewOp =
40064 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
40065 Op, OpScaledDemandedElts, DAG))
40066 Op = NewOp;
40067 }
40068 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
40069
40070 // Widen any subvector shuffle inputs we've collected.
40071 // TODO: Remove this to avoid generating temporary nodes, we should only
40072 // widen once combineX86ShuffleChain has found a match.
40073 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
40074 return Op.getValueSizeInBits() < RootSizeInBits;
40075 })) {
40076 for (SDValue &Op : Ops)
40077 if (Op.getValueSizeInBits() < RootSizeInBits)
40078 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
40079 RootSizeInBits);
40080 // Reresolve - we might have repeated subvector sources.
40081 resolveTargetShuffleInputsAndMask(Ops, Mask);
40082 }
40083
40084 // We can only combine unary and binary shuffle mask cases.
40085 if (Ops.size() <= 2) {
40086 // Minor canonicalization of the accumulated shuffle mask to make it easier
40087 // to match below. All this does is detect masks with sequential pairs of
40088 // elements, and shrink them to the half-width mask. It does this in a loop
40089 // so it will reduce the size of the mask to the minimal width mask which
40090 // performs an equivalent shuffle.
40091 while (Mask.size() > 1) {
40092 SmallVector<int, 64> WidenedMask;
40093 if (!canWidenShuffleElements(Mask, WidenedMask))
40094 break;
40095 Mask = std::move(WidenedMask);
40096 }
40097
40098 // Canonicalization of binary shuffle masks to improve pattern matching by
40099 // commuting the inputs.
40100 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
40101 ShuffleVectorSDNode::commuteMask(Mask);
40102 std::swap(Ops[0], Ops[1]);
40103 }
40104
40105 // Try to combine into a single shuffle instruction.
40106 if (SDValue Shuffle = combineX86ShuffleChain(
40107 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40108 AllowVariablePerLaneMask, DAG, Subtarget))
40109 return Shuffle;
40110
40111 // If all the operands come from the same larger vector, fallthrough and try
40112 // to use combineX86ShuffleChainWithExtract.
40113 SDValue LHS = peekThroughBitcasts(Ops.front());
40114 SDValue RHS = peekThroughBitcasts(Ops.back());
40115 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
40116 (RootSizeInBits / Mask.size()) != 64 ||
40117 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40118 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40119 LHS.getOperand(0) != RHS.getOperand(0))
40120 return SDValue();
40121 }
40122
40123 // If that failed and any input is extracted then try to combine as a
40124 // shuffle with the larger type.
40125 return combineX86ShuffleChainWithExtract(
40126 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40127 AllowVariablePerLaneMask, DAG, Subtarget);
40128}
40129
40130/// Helper entry wrapper to combineX86ShufflesRecursively.
40131 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
40132 const X86Subtarget &Subtarget) {
40133 return combineX86ShufflesRecursively(
40134 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
40135 /*HasVarMask*/ false,
40136 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
40137 Subtarget);
40138}
40139
40140/// Get the PSHUF-style mask from PSHUF node.
40141///
40142 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
40143/// PSHUF-style masks that can be reused with such instructions.
40144 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
40145 MVT VT = N.getSimpleValueType();
40146 SmallVector<int, 4> Mask;
40147 SmallVector<SDValue, 2> Ops;
40148 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
40149 (void)HaveMask;
40150 assert(HaveMask);
40151
40152 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40153 // matter. Check that the upper masks are repeats and remove them.
40154 if (VT.getSizeInBits() > 128) {
40155 int LaneElts = 128 / VT.getScalarSizeInBits();
40156#ifndef NDEBUG
40157 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
40158 for (int j = 0; j < LaneElts; ++j)
40159 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40160 "Mask doesn't repeat in high 128-bit lanes!");
40161#endif
40162 Mask.resize(LaneElts);
40163 }
40164
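// e.g. a PSHUFHW node with mask <0,1,2,3,7,6,5,4> returns the v4 mask
// <3,2,1,0>; PSHUFLW returns its low four mask elements directly, and a
// PSHUFD mask is already v4-sized.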
40165 switch (N.getOpcode()) {
40166 case X86ISD::PSHUFD:
40167 return Mask;
40168 case X86ISD::PSHUFLW:
40169 Mask.resize(4);
40170 return Mask;
40171 case X86ISD::PSHUFHW:
40172 Mask.erase(Mask.begin(), Mask.begin() + 4);
40173 for (int &M : Mask)
40174 M -= 4;
40175 return Mask;
40176 default:
40177 llvm_unreachable("No valid shuffle instruction found!");
40178 }
40179}
40180
40181/// Search for a combinable shuffle across a chain ending in pshufd.
40182///
40183/// We walk up the chain and look for a combinable shuffle, skipping over
40184/// shuffles that we could hoist this shuffle's transformation past without
40185/// altering anything.
40186 static SDValue combineRedundantDwordShuffle(SDValue N,
40187 MutableArrayRef<int> Mask,
40188 const SDLoc &DL,
40189 SelectionDAG &DAG) {
40190 assert(N.getOpcode() == X86ISD::PSHUFD &&
40191 "Called with something other than an x86 128-bit half shuffle!");
40192
40193 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40194 // of the shuffles in the chain so that we can form a fresh chain to replace
40195 // this one.
40196 SmallVector<SDValue, 8> Chain;
40197 SDValue V = N.getOperand(0);
40198 for (; V.hasOneUse(); V = V.getOperand(0)) {
40199 switch (V.getOpcode()) {
40200 default:
40201 return SDValue(); // Nothing combined!
40202
40203 case ISD::BITCAST:
40204 // Skip bitcasts as we always know the type for the target specific
40205 // instructions.
40206 continue;
40207
40208 case X86ISD::PSHUFD:
40209 // Found another dword shuffle.
40210 break;
40211
40212 case X86ISD::PSHUFLW:
40213 // Check that the low words (being shuffled) are the identity in the
40214 // dword shuffle, and the high words are self-contained.
40215 if (Mask[0] != 0 || Mask[1] != 1 ||
40216 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
40217 return SDValue();
40218
40219 Chain.push_back(V);
40220 continue;
40221
40222 case X86ISD::PSHUFHW:
40223 // Check that the high words (being shuffled) are the identity in the
40224 // dword shuffle, and the low words are self-contained.
40225 if (Mask[2] != 2 || Mask[3] != 3 ||
40226 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
40227 return SDValue();
40228
40229 Chain.push_back(V);
40230 continue;
40231
40232 case X86ISD::UNPCKL:
40233 case X86ISD::UNPCKH:
40234 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40235 // shuffle into a preceding word shuffle.
40236 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
40237 V.getSimpleValueType().getVectorElementType() != MVT::i16)
40238 return SDValue();
40239
40240 // Search for a half-shuffle which we can combine with.
40241 unsigned CombineOp =
40242 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
40243 if (V.getOperand(0) != V.getOperand(1) ||
40244 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40245 return SDValue();
40246 Chain.push_back(V);
40247 V = V.getOperand(0);
40248 do {
40249 switch (V.getOpcode()) {
40250 default:
40251 return SDValue(); // Nothing to combine.
40252
40253 case X86ISD::PSHUFLW:
40254 case X86ISD::PSHUFHW:
40255 if (V.getOpcode() == CombineOp)
40256 break;
40257
40258 Chain.push_back(V);
40259
40260 [[fallthrough]];
40261 case ISD::BITCAST:
40262 V = V.getOperand(0);
40263 continue;
40264 }
40265 break;
40266 } while (V.hasOneUse());
40267 break;
40268 }
40269 // Break out of the loop if we break out of the switch.
40270 break;
40271 }
40272
40273 if (!V.hasOneUse())
40274 // We fell out of the loop without finding a viable combining instruction.
40275 return SDValue();
40276
40277 // Merge this node's mask and our incoming mask.
40278 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40279 for (int &M : Mask)
40280 M = VMask[M];
40281 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
40282 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40283
40284 // Rebuild the chain around this new shuffle.
40285 while (!Chain.empty()) {
40286 SDValue W = Chain.pop_back_val();
40287
40288 if (V.getValueType() != W.getOperand(0).getValueType())
40289 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40290
40291 switch (W.getOpcode()) {
40292 default:
40293 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
40294
40295 case X86ISD::UNPCKL:
40296 case X86ISD::UNPCKH:
40297 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40298 break;
40299
40300 case X86ISD::PSHUFD:
40301 case X86ISD::PSHUFLW:
40302 case X86ISD::PSHUFHW:
40303 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40304 break;
40305 }
40306 }
40307 if (V.getValueType() != N.getValueType())
40308 V = DAG.getBitcast(N.getValueType(), V);
40309
40310 // Return the new chain to replace N.
40311 return V;
40312}
40313
40314// Attempt to commute shufps LHS loads:
40315// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40316 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40317 SelectionDAG &DAG) {
40318 // TODO: Add vXf64 support.
40319 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40320 return SDValue();
40321
40322 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40323 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40324 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40325 return SDValue();
40326 SDValue N0 = V.getOperand(0);
40327 SDValue N1 = V.getOperand(1);
40328 unsigned Imm = V.getConstantOperandVal(2);
40329 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40330 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40331 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40332 return SDValue();
40333 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40334 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40335 DAG.getTargetConstant(Imm, DL, MVT::i8));
40336 };
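// Note: commuting SHUFP(N0,N1) swaps the two 64-bit halves of each 128-bit
// lane of its result (the nibble swap above re-targets each pair of 2-bit
// selectors at the other operand). The callers below compensate by XORing
// the outer shuffle immediate: 0xAA flips bit 1 of every 2-bit index (both
// result pairs moved), while 0x0A / 0xA0 flip only the indices that read
// the commuted operand.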
40337
40338 switch (N.getOpcode()) {
40339 case X86ISD::VPERMILPI:
40340 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40341 unsigned Imm = N.getConstantOperandVal(1);
40342 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40343 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40344 }
40345 break;
40346 case X86ISD::SHUFP: {
40347 SDValue N0 = N.getOperand(0);
40348 SDValue N1 = N.getOperand(1);
40349 unsigned Imm = N.getConstantOperandVal(2);
40350 if (N0 == N1) {
40351 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40352 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40353 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40354 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40355 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40356 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40357 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40358 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40359 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40360 }
40361 break;
40362 }
40363 }
40364
40365 return SDValue();
40366}
40367
40368// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
40369// iff we don't demand the same element index for both X and Y.
40370static SDValue
40371 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
40372 const APInt &DemandedElts, SelectionDAG &DAG,
40373 const X86Subtarget &Subtarget, const SDLoc &DL) {
40374 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
40375 if (!N0.hasOneUse() || !N1.hasOneUse())
40376 return SDValue();
40377
40378 unsigned NumElts = VT.getVectorNumElements();
40379 SDValue BC0 = peekThroughOneUseBitcasts(N0);
40380 SDValue BC1 = peekThroughOneUseBitcasts(N1);
40381
40382 // See if both operands are shuffles, and that we can scale the shuffle masks
40383 // to the same width as the blend mask.
40384 // TODO: Support SM_SentinelZero?
40385 SmallVector<SDValue, 2> Ops0, Ops1;
40386 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
40387 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
40388 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
40389 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
40390 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
40391 return SDValue();
40392
40393 // Determine the demanded elts from both permutes.
40394 APInt Demanded0, DemandedLHS0, DemandedRHS0;
40395 APInt Demanded1, DemandedLHS1, DemandedRHS1;
40396 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
40397 Demanded1,
40398 /*AllowUndefElts=*/true) ||
40399 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
40400 DemandedRHS0, /*AllowUndefElts=*/true) ||
40401 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
40402 DemandedRHS1, /*AllowUndefElts=*/true))
40403 return SDValue();
40404
40405 // Confirm that we only use a single operand from both permutes and that we
40406 // don't demand the same index from both.
40407 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
40408 DemandedLHS0.intersects(DemandedLHS1))
40409 return SDValue();
40410
40411 // Use the permute demanded elts masks as the new blend mask.
40412 // Create the new permute mask as a blend of the 2 original permute masks.
40413 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
40414 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
40415 for (unsigned I = 0; I != NumElts; ++I) {
40416 if (Demanded0[I]) {
40417 int M = ScaledMask0[I];
40418 if (0 <= M) {
40419 assert(isUndefOrEqual(NewBlendMask[M], M) &&
40420 "BlendMask demands LHS AND RHS");
40421 NewBlendMask[M] = M;
40422 NewPermuteMask[I] = M;
40423 }
40424 } else if (Demanded1[I]) {
40425 int M = ScaledMask1[I];
40426 if (0 <= M) {
40427 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
40428 "BlendMask demands LHS AND RHS");
40429 NewBlendMask[M] = M + NumElts;
40430 NewPermuteMask[I] = M;
40431 }
40432 }
40433 }
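// e.g. for v4i32 with BlendMask = <0,5,2,7> and both inputs permuted by
// <1,0,3,2>: the demanded source elements are disjoint ({1,3} from X,
// {0,2} from Y), giving NewBlendMask = <4,1,6,3> and
// NewPermuteMask = <1,0,3,2>, i.e. permute(blend(X,Y)).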
40434 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
40435 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
40436
40437 // v16i16 shuffles can explode in complexity very easily, so only accept them if
40438 // the blend mask is the same in the 128-bit subvectors (or can widen to
40439 // v8i32) and the permute can be widened as well.
40440 if (VT == MVT::v16i16) {
40441 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
40442 !canWidenShuffleElements(NewBlendMask))
40443 return SDValue();
40444 if (!canWidenShuffleElements(NewPermuteMask))
40445 return SDValue();
40446 }
40447
40448 // Don't introduce lane-crossing permutes without AVX2, unless it can be
40449 // widened to a lane permute (vperm2f128).
40450 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
40451 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
40452 NewPermuteMask) &&
40453 !canScaleShuffleElements(NewPermuteMask, 2))
40454 return SDValue();
40455
40456 SDValue NewBlend =
40457 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
40458 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
40459 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
40460 NewPermuteMask);
40461}
40462
40463// TODO - move this to TLI like isBinOp?
40464static bool isUnaryOp(unsigned Opcode) {
40465 switch (Opcode) {
40466 case ISD::CTLZ:
40467 case ISD::CTTZ:
40468 case ISD::CTPOP:
40469 return true;
40470 }
40471 return false;
40472}
40473
40474// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40475// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40476 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40477 const SDLoc &DL) {
40478 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40479 EVT ShuffleVT = N.getValueType();
40480 unsigned Opc = N.getOpcode();
40481
40482 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40483 bool FoldLoad = false) {
40484 // AllZeros/AllOnes constants are freely shuffled and will peek through
40485 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40486 // merge with target shuffles if it has one use so shuffle combining is
40487 // likely to kick in. Shuffles of splats are expected to be removed.
40488 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40489 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40490 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40491 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40492 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40493 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40494 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40495 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40496 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40497 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40498 };
40499 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40500 // Ensure we only shuffle whole vector src elements, unless it's a logical
40501 // binop where we can more aggressively move shuffles from dst to src.
40502 return isLogicOp(BinOp) ||
40503 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40504 };
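// e.g. PSHUFD(ADD(X, BuildVectorConstant)) can become
// ADD(PSHUFD(X), PSHUFD(BuildVectorConstant)): the shuffled constant folds
// away and the remaining shuffle may combine further with X's own shuffles.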
40505
40506 switch (Opc) {
40507 // Unary and Unary+Permute Shuffles.
40508 case X86ISD::PSHUFB: {
40509 // Don't merge PSHUFB if it contains zero'd elements.
40510 SmallVector<int> Mask;
40511 SmallVector<SDValue> Ops;
40512 if (!getTargetShuffleMask(N, false, Ops, Mask))
40513 break;
40514 [[fallthrough]];
40515 }
40516 case X86ISD::VBROADCAST:
40517 case X86ISD::MOVDDUP:
40518 case X86ISD::PSHUFD:
40519 case X86ISD::PSHUFHW:
40520 case X86ISD::PSHUFLW:
40521 case X86ISD::VPERMI:
40522 case X86ISD::VPERMILPI: {
40523 if (N.getOperand(0).getValueType() == ShuffleVT &&
40524 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40525 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40526 unsigned SrcOpcode = N0.getOpcode();
40527 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40528 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40529 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40530 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40531 Opc != X86ISD::PSHUFB) ||
40532 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40533 Opc != X86ISD::PSHUFB)) {
40534 SDValue LHS, RHS;
40535 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40536 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40537 if (N.getNumOperands() == 2) {
40538 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40539 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40540 } else {
40541 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40542 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40543 }
40544 EVT OpVT = N0.getValueType();
40545 return DAG.getBitcast(ShuffleVT,
40546 DAG.getNode(SrcOpcode, DL, OpVT,
40547 DAG.getBitcast(OpVT, LHS),
40548 DAG.getBitcast(OpVT, RHS)));
40549 }
40550 }
40551 }
40552 break;
40553 }
40554 // Binary and Binary+Permute Shuffles.
40555 case X86ISD::INSERTPS: {
40556 // Don't merge INSERTPS if it contains zero'd elements.
40557 unsigned InsertPSMask = N.getConstantOperandVal(2);
40558 unsigned ZeroMask = InsertPSMask & 0xF;
40559 if (ZeroMask != 0)
40560 break;
40561 [[fallthrough]];
40562 }
40563 case X86ISD::MOVSD:
40564 case X86ISD::MOVSS:
40565 case X86ISD::BLENDI:
40566 case X86ISD::SHUFP:
40567 case X86ISD::UNPCKH:
40568 case X86ISD::UNPCKL: {
40569 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40570 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40571 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40572 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40573 unsigned SrcOpcode = N0.getOpcode();
40574 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40575 N0.getValueType() == N1.getValueType() &&
40576 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40577 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40578 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40579 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40580 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40581 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40582 // Ensure the total number of shuffles doesn't increase by folding this
40583 // shuffle through to the source ops.
40584 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40585 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40586 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40587 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40588 SDValue LHS, RHS;
40589 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40590 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40591 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40592 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40593 if (N.getNumOperands() == 3) {
40594 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40595 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40596 } else {
40597 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40598 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40599 }
40600 EVT OpVT = N0.getValueType();
40601 return DAG.getBitcast(ShuffleVT,
40602 DAG.getNode(SrcOpcode, DL, OpVT,
40603 DAG.getBitcast(OpVT, LHS),
40604 DAG.getBitcast(OpVT, RHS)));
40605 }
40606 }
40607 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40608 N0.getValueType() == N1.getValueType() &&
40609 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40610 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40611 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40612 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40613 SDValue Res;
40614 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40615 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40616 if (N.getNumOperands() == 3) {
40617 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40618 } else {
40619 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40620 }
40621 EVT OpVT = N0.getValueType();
40622 return DAG.getBitcast(
40623 ShuffleVT,
40624 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40625 }
40626 }
40627 break;
40628 }
40629 }
40630 return SDValue();
40631}
40632
40633/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40634 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40635 SelectionDAG &DAG,
40636 const SDLoc &DL) {
40637 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40638
40639 MVT VT = V.getSimpleValueType();
40640 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40641 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40642 unsigned SrcOpc0 = Src0.getOpcode();
40643 unsigned SrcOpc1 = Src1.getOpcode();
40644 EVT SrcVT0 = Src0.getValueType();
40645 EVT SrcVT1 = Src1.getValueType();
40646
40647 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40648 return SDValue();
40649
40650 switch (SrcOpc0) {
40651 case X86ISD::MOVDDUP: {
40652 SDValue LHS = Src0.getOperand(0);
40653 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40654 SDValue Res =
40655 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40656 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40657 return DAG.getBitcast(VT, Res);
40658 }
40659 case X86ISD::VPERMILPI:
40660 // TODO: Handle v4f64 permutes with different low/high lane masks.
40661 if (SrcVT0 == MVT::v4f64) {
40662 uint64_t Mask = Src0.getConstantOperandVal(1);
40663 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40664 break;
40665 }
40666 [[fallthrough]];
40667 case X86ISD::VSHLI:
40668 case X86ISD::VSRLI:
40669 case X86ISD::VSRAI:
40670 case X86ISD::PSHUFD:
40671 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40672 SDValue LHS = Src0.getOperand(0);
40673 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40674 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40675 V.getOperand(2));
40676 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40677 return DAG.getBitcast(VT, Res);
40678 }
40679 break;
40680 }
40681
40682 return SDValue();
40683}
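// Illustrative sketch (guarded out, not part of the lowering): a scalar
// reference model of the VPERM2X128 immediate that the fold above commutes
// across MOVDDUP/VPERMILPI/shift nodes. Bits [1:0] and [5:4] pick one 128-bit
// half of the two concatenated sources for the low and high result halves,
// and bits 3 and 7 force the corresponding half to zero. The helper name is
// hypothetical; elements are modelled as i64 so each half holds two values.
#if 0
static void vperm2x128Model(const long long LHS[4], const long long RHS[4],
                            unsigned Imm, long long Out[4]) {
  const long long *Halves[4] = {&LHS[0], &LHS[2], &RHS[0], &RHS[2]};
  for (int Half = 0; Half != 2; ++Half) {
    unsigned Ctrl = (Imm >> (4 * Half)) & 0xF;
    for (int i = 0; i != 2; ++i)
      Out[2 * Half + i] = (Ctrl & 0x8) ? 0 : Halves[Ctrl & 0x3][i];
  }
}
#endif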
40684
40685/// Try to combine x86 target specific shuffles.
40687 SelectionDAG &DAG,
40689 const X86Subtarget &Subtarget) {
40690 MVT VT = N.getSimpleValueType();
40691 SmallVector<int, 4> Mask;
40692 unsigned Opcode = N.getOpcode();
40693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40694
40695 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40696 return R;
40697
40698 // Handle specific target shuffles.
40699 switch (Opcode) {
40700 case X86ISD::MOVDDUP: {
40701 SDValue Src = N.getOperand(0);
40702 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40703 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40704 ISD::isNormalLoad(Src.getNode())) {
40705 LoadSDNode *LN = cast<LoadSDNode>(Src);
40706 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40707 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40708 DCI.CombineTo(N.getNode(), Movddup);
40709 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40711 return N; // Return N so it doesn't get rechecked!
40712 }
40713 }
40714
40715 return SDValue();
40716 }
40717 case X86ISD::VBROADCAST: {
40718 SDValue Src = N.getOperand(0);
40719 SDValue BC = peekThroughBitcasts(Src);
40720 EVT SrcVT = Src.getValueType();
40721 EVT BCVT = BC.getValueType();
40722
40723 // If broadcasting from another shuffle, attempt to simplify it.
40724 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40725 if (isTargetShuffle(BC.getOpcode()) &&
40726 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40727 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40728 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40730 for (unsigned i = 0; i != Scale; ++i)
40731 DemandedMask[i] = i;
40733 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40735 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40736 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40737 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40738 DAG.getBitcast(SrcVT, Res));
40739 }
40740
40741 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40742 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40743 if (Src.getOpcode() == ISD::BITCAST &&
40744 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40745 TLI.isTypeLegal(BCVT) &&
40747 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40748 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40750 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40751 }
40752
40753 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40754 // If we're re-broadcasting a smaller type then broadcast with that type and
40755 // bitcast.
40756 // TODO: Do this for any splat?
40757 if (Src.getOpcode() == ISD::BITCAST &&
40758 (BC.getOpcode() == X86ISD::VBROADCAST ||
40760 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40761 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40762 MVT NewVT =
40764 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40765 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40766 }
40767
40768 // Reduce broadcast source vector to lowest 128-bits.
40769 if (SrcVT.getSizeInBits() > 128)
40770 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40771 extract128BitVector(Src, 0, DAG, DL));
40772
40773 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40774 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40775 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40776 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40777
40778 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40779 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40780 isNullConstant(Src.getOperand(1)) &&
40781 Src.getValueType() ==
40782 Src.getOperand(0).getValueType().getScalarType() &&
40783 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40784 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40785
40786 // Share broadcast with the longest vector and extract low subvector (free).
40787 // Ensure the other user is broadcasting the same SDValue, not just the same node.
40788 for (SDNode *User : Src->uses())
40789 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40790 Src == User->getOperand(0) &&
40791 User->getValueSizeInBits(0).getFixedValue() >
40792 VT.getFixedSizeInBits()) {
40793 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40794 VT.getSizeInBits());
40795 }
40796
40797 // vbroadcast(scalarload X) -> vbroadcast_load X
40798 // For float loads, extract other uses of the scalar from the broadcast.
40799 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40800 ISD::isNormalLoad(Src.getNode())) {
40801 LoadSDNode *LN = cast<LoadSDNode>(Src);
40802 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40803 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40804 SDValue BcastLd =
40806 LN->getMemoryVT(), LN->getMemOperand());
40807 // If the load value is used only by N, replace it via CombineTo N.
40808 bool NoReplaceExtract = Src.hasOneUse();
40809 DCI.CombineTo(N.getNode(), BcastLd);
40810 if (NoReplaceExtract) {
40811 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40813 } else {
40814 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40815 DAG.getIntPtrConstant(0, DL));
40816 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40817 }
40818 return N; // Return N so it doesn't get rechecked!
40819 }
40820
40821 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40822 // i16. So shrink it ourselves if we can make a broadcast_load.
40823 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40824 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40825 assert(Subtarget.hasAVX2() && "Expected AVX2");
40826 SDValue TruncIn = Src.getOperand(0);
40827
40828 // If this is a truncate of a non-extending load, we can just narrow it to
40829 // use a broadcast_load.
40830 if (ISD::isNormalLoad(TruncIn.getNode())) {
40831 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40832 // Unless it's volatile or atomic.
40833 if (LN->isSimple()) {
40834 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40835 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40836 SDValue BcastLd = DAG.getMemIntrinsicNode(
40837 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40838 LN->getPointerInfo(), LN->getOriginalAlign(),
40839 LN->getMemOperand()->getFlags());
40840 DCI.CombineTo(N.getNode(), BcastLd);
40841 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40842 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40843 return N; // Return N so it doesn't get rechecked!
40844 }
40845 }
40846
40847 // If this is a truncate of an i16 extload, we can directly replace it.
40848 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40849 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40850 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40851 if (LN->getMemoryVT().getSizeInBits() == 16) {
40852 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40853 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40854 SDValue BcastLd =
40856 LN->getMemoryVT(), LN->getMemOperand());
40857 DCI.CombineTo(N.getNode(), BcastLd);
40858 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40859 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40860 return N; // Return N so it doesn't get rechecked!
40861 }
40862 }
40863
40864 // If this is a truncate of a load that has been shifted right, we can
40865 // offset the pointer and use a narrower load.
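// e.g. on this little-endian target, broadcast(trunc(srl(i32 load [p], 16)))
// can instead broadcast_load an i16 directly from [p + 2].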
40866 if (TruncIn.getOpcode() == ISD::SRL &&
40867 TruncIn.getOperand(0).hasOneUse() &&
40868 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40869 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40870 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40871 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40872 // Make sure the shift amount and the load size are divisible by 16.
40873 // Don't do this if the load is volatile or atomic.
40874 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40875 LN->isSimple()) {
40876 unsigned Offset = ShiftAmt / 8;
40877 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40880 SDValue Ops[] = { LN->getChain(), Ptr };
40881 SDValue BcastLd = DAG.getMemIntrinsicNode(
40882 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40884 LN->getOriginalAlign(),
40885 LN->getMemOperand()->getFlags());
40886 DCI.CombineTo(N.getNode(), BcastLd);
40887 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40888 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40889 return N; // Return N so it doesn't get rechecked!
40890 }
40891 }
40892 }
40893
40894 // vbroadcast(vzload X) -> vbroadcast_load X
40895 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40896 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40897 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40898 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40899 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40900 SDValue BcastLd =
40902 LN->getMemoryVT(), LN->getMemOperand());
40903 DCI.CombineTo(N.getNode(), BcastLd);
40904 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40906 return N; // Return N so it doesn't get rechecked!
40907 }
40908 }
40909
40910 // vbroadcast(vector load X) -> vbroadcast_load
40911 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40912 SrcVT == MVT::v4i32) &&
40913 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40914 LoadSDNode *LN = cast<LoadSDNode>(Src);
40915 // Unless the load is volatile or atomic.
40916 if (LN->isSimple()) {
40917 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40918 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40919 SDValue BcastLd = DAG.getMemIntrinsicNode(
40920 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40921 LN->getPointerInfo(), LN->getOriginalAlign(),
40922 LN->getMemOperand()->getFlags());
40923 DCI.CombineTo(N.getNode(), BcastLd);
40924 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40926 return N; // Return N so it doesn't get rechecked!
40927 }
40928 }
40929
40930 return SDValue();
40931 }
40932 case X86ISD::VZEXT_MOVL: {
40933 SDValue N0 = N.getOperand(0);
40934
40935 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
40936 // the load is volatile.
40937 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40938 auto *LN = cast<LoadSDNode>(N0);
40939 if (SDValue VZLoad =
40940 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40941 DCI.CombineTo(N.getNode(), VZLoad);
40942 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40944 return N;
40945 }
40946 }
40947
40948 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40949 // and can just use a VZEXT_LOAD.
40950 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40951 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40952 auto *LN = cast<MemSDNode>(N0);
40953 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40954 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40955 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40956 SDValue VZLoad =
40958 LN->getMemoryVT(), LN->getMemOperand());
40959 DCI.CombineTo(N.getNode(), VZLoad);
40960 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40962 return N;
40963 }
40964 }
40965
40966 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40967 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40968 // if the upper bits of the i64 are zero.
40969 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40970 N0.getOperand(0).hasOneUse() &&
40971 N0.getOperand(0).getValueType() == MVT::i64) {
40972 SDValue In = N0.getOperand(0);
40973 APInt Mask = APInt::getHighBitsSet(64, 32);
40974 if (DAG.MaskedValueIsZero(In, Mask)) {
40975 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40976 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40977 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40978 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40979 return DAG.getBitcast(VT, Movl);
40980 }
40981 }
40982
40983 // Load a scalar integer constant directly to XMM instead of transferring an
40984 // immediate value from GPR.
40985 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40986 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40987 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40988 // Create a vector constant - scalar constant followed by zeros.
40989 EVT ScalarVT = N0.getOperand(0).getValueType();
40990 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40991 unsigned NumElts = VT.getVectorNumElements();
40992 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40993 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40994 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40995
40996 // Load the vector constant from constant pool.
40997 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40998 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40999 MachinePointerInfo MPI =
41001 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41002 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41004 }
41005 }
41006
41007 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41008 // insert into a zero vector. This helps get VZEXT_MOVL closer to the
41009 // 128-bit scalar_to_vector that 256/512-bit scalar_to_vectors are
41010 // canonicalized into, and so reduces the number of isel patterns.
41011 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41012 SDValue V = peekThroughOneUseBitcasts(N0);
41013
41014 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41015 isNullConstant(V.getOperand(2))) {
41016 SDValue In = V.getOperand(1);
41018 In.getValueSizeInBits() /
41019 VT.getScalarSizeInBits());
41020 In = DAG.getBitcast(SubVT, In);
41021 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41022 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41023 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41024 V.getOperand(2));
41025 }
41026 }
41027
41028 return SDValue();
41029 }
41030 case X86ISD::BLENDI: {
41031 SDValue N0 = N.getOperand(0);
41032 SDValue N1 = N.getOperand(1);
41033 unsigned EltBits = VT.getScalarSizeInBits();
41034
41035 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
41036 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41037 // TODO: Handle MVT::v16i16 repeated blend mask.
41038 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41039 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41040 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41041 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
41042 unsigned Size = VT.getVectorNumElements();
41043 unsigned NewSize = SrcVT.getVectorNumElements();
41044 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
41045 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
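// e.g. a v4i64 blend mask 0b0101 is rescaled to the v8i32 blend mask
// 0b00110011 - each wide-element bit is repeated for the two narrow
// elements it covers.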
41046 return DAG.getBitcast(
41047 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41048 N1.getOperand(0),
41049 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
41050 DL, MVT::i8)));
41051 }
41052 }
41053 // Share PSHUFB masks:
41054 // blend(pshufb(x,m1),pshufb(y,m2))
41055 // --> m3 = blend(m1,m2)
41056 // blend(pshufb(x,m3),pshufb(y,m3))
41057 if (N0.hasOneUse() && N1.hasOneUse()) {
41058 SmallVector<int> Mask, ByteMask;
41059 SmallVector<SDValue> Ops;
41060 SDValue LHS = peekThroughOneUseBitcasts(N0);
41061 SDValue RHS = peekThroughOneUseBitcasts(N1);
41062 if (LHS.getOpcode() == X86ISD::PSHUFB &&
41063 RHS.getOpcode() == X86ISD::PSHUFB &&
41064 LHS.getOperand(1) != RHS.getOperand(1) &&
41065 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
41066 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
41067 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
41068 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
41069 "BLENDI decode mismatch");
41070 MVT ShufVT = LHS.getSimpleValueType();
41071 SDValue MaskLHS = LHS.getOperand(1);
41072 SDValue MaskRHS = RHS.getOperand(1);
41073 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
41075 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
41076 /*HasVariableMask=*/true, DAG, DL, Subtarget)) {
41077 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
41078 LHS.getOperand(0), NewMask);
41079 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
41080 RHS.getOperand(0), NewMask);
41081 return DAG.getNode(X86ISD::BLENDI, DL, VT,
41082 DAG.getBitcast(VT, NewLHS),
41083 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
41084 }
41085 }
41086 }
41087 }
41088 return SDValue();
41089 }
41090 case X86ISD::SHUFP: {
41091 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41092 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41093 // TODO: Support types other than v4f32.
41094 if (VT == MVT::v4f32) {
41095 bool Updated = false;
41096 SmallVector<int> Mask;
41097 SmallVector<SDValue> Ops;
41098 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
41099 for (int i = 0; i != 2; ++i) {
41100 SmallVector<SDValue> SubOps;
41101 SmallVector<int> SubMask, SubScaledMask;
41102 SDValue Sub = peekThroughBitcasts(Ops[i]);
41103 // TODO: Scaling might be easier if we specify the demanded elts.
41104 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41105 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41106 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41107 int Ofs = i * 2;
41108 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41109 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41110 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41111 Updated = true;
41112 }
41113 }
41114 }
41115 if (Updated) {
41116 for (int &M : Mask)
41117 M %= 4;
41118 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41119 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41120 }
41121 }
41122 return SDValue();
41123 }
41124 case X86ISD::VPERMI: {
41125 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41126 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41127 SDValue N0 = N.getOperand(0);
41128 SDValue N1 = N.getOperand(1);
41129 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41130 if (N0.getOpcode() == ISD::BITCAST &&
41131 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41132 SDValue Src = N0.getOperand(0);
41133 EVT SrcVT = Src.getValueType();
41134 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41135 return DAG.getBitcast(VT, Res);
41136 }
41137 return SDValue();
41138 }
41139 case X86ISD::SHUF128: {
41140 // If we're permuting the upper 256-bit subvectors of a concatenation, then
41141 // see if we can peek through and access the subvector directly.
41142 if (VT.is512BitVector()) {
41143 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
41144 // upper subvector is used.
41145 SDValue LHS = N->getOperand(0);
41146 SDValue RHS = N->getOperand(1);
41147 uint64_t Mask = N->getConstantOperandVal(2);
41148 SmallVector<SDValue> LHSOps, RHSOps;
41149 SDValue NewLHS, NewRHS;
41150 if ((Mask & 0x0A) == 0x0A &&
41151 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
41152 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
41153 Mask &= ~0x0A;
41154 }
41155 if ((Mask & 0xA0) == 0xA0 &&
41156 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
41157 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
41158 Mask &= ~0xA0;
41159 }
41160 if (NewLHS || NewRHS)
41161 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
41162 NewRHS ? NewRHS : RHS,
41163 DAG.getTargetConstant(Mask, DL, MVT::i8));
41164 }
41165 return SDValue();
41166 }
41167 case X86ISD::VPERM2X128: {
41168 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41169 SDValue LHS = N->getOperand(0);
41170 SDValue RHS = N->getOperand(1);
41171 if (LHS.getOpcode() == ISD::BITCAST &&
41172 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
41173 EVT SrcVT = LHS.getOperand(0).getValueType();
41174 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
41175 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
41176 DAG.getBitcast(SrcVT, LHS),
41177 DAG.getBitcast(SrcVT, RHS),
41178 N->getOperand(2)));
41179 }
41180 }
41181
41182 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41184 return Res;
41185
41186 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
41187 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
41188 auto FindSubVector128 = [&](unsigned Idx) {
41189 if (Idx > 3)
41190 return SDValue();
41191 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
41192 SmallVector<SDValue> SubOps;
41193 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
41194 return SubOps[Idx & 1];
41195 unsigned NumElts = Src.getValueType().getVectorNumElements();
41196 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
41197 Src.getOperand(1).getValueSizeInBits() == 128 &&
41198 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
41199 return Src.getOperand(1);
41200 }
41201 return SDValue();
41202 };
41203 unsigned Imm = N.getConstantOperandVal(2);
41204 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
41205 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
41206 MVT SubVT = VT.getHalfNumVectorElementsVT();
41207 SubLo = DAG.getBitcast(SubVT, SubLo);
41208 SubHi = DAG.getBitcast(SubVT, SubHi);
41209 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
41210 }
41211 }
41212 return SDValue();
41213 }
41214 case X86ISD::PSHUFD:
41215 case X86ISD::PSHUFLW:
41216 case X86ISD::PSHUFHW: {
41217 SDValue N0 = N.getOperand(0);
41218 SDValue N1 = N.getOperand(1);
41219 if (N0->hasOneUse()) {
41221 switch (V.getOpcode()) {
41222 case X86ISD::VSHL:
41223 case X86ISD::VSRL:
41224 case X86ISD::VSRA:
41225 case X86ISD::VSHLI:
41226 case X86ISD::VSRLI:
41227 case X86ISD::VSRAI:
41228 case X86ISD::VROTLI:
41229 case X86ISD::VROTRI: {
41230 MVT InnerVT = V.getSimpleValueType();
41231 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
41232 SDValue Res = DAG.getNode(Opcode, DL, VT,
41233 DAG.getBitcast(VT, V.getOperand(0)), N1);
41234 Res = DAG.getBitcast(InnerVT, Res);
41235 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
41236 return DAG.getBitcast(VT, Res);
41237 }
41238 break;
41239 }
41240 }
41241 }
41242
41243 Mask = getPSHUFShuffleMask(N);
41244 assert(Mask.size() == 4);
41245 break;
41246 }
41247 case X86ISD::MOVSD:
41248 case X86ISD::MOVSH:
41249 case X86ISD::MOVSS: {
41250 SDValue N0 = N.getOperand(0);
41251 SDValue N1 = N.getOperand(1);
41252
41253 // Canonicalize scalar FPOps:
41254 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41255 // If commutable, allow OP(N1[0], N0[0]).
41256 unsigned Opcode1 = N1.getOpcode();
41257 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
41258 Opcode1 == ISD::FDIV) {
41259 SDValue N10 = N1.getOperand(0);
41260 SDValue N11 = N1.getOperand(1);
41261 if (N10 == N0 ||
41262 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
41263 if (N10 != N0)
41264 std::swap(N10, N11);
41265 MVT SVT = VT.getVectorElementType();
41266 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
41267 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
41268 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
41269 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
41270 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
41271 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
41272 }
41273 }
41274
41275 return SDValue();
41276 }
41277 case X86ISD::INSERTPS: {
41278 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
41279 SDValue Op0 = N.getOperand(0);
41280 SDValue Op1 = N.getOperand(1);
41281 unsigned InsertPSMask = N.getConstantOperandVal(2);
41282 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
41283 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
41284 unsigned ZeroMask = InsertPSMask & 0xF;
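// e.g. InsertPSMask 0x4C (0b01'00'1100) takes element 1 of Op1, writes it to
// element 0 of the result, keeps element 1 from Op0 and zeroes elements 2-3.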
41285
41286 // If we zero out all elements from Op0 then we don't need to reference it.
41287 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
41288 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
41289 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41290
41291 // If we zero out the element from Op1 then we don't need to reference it.
41292 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
41293 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41294 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41295
41296 // Attempt to merge insertps Op1 with an inner target shuffle node.
41297 SmallVector<int, 8> TargetMask1;
41298 SmallVector<SDValue, 2> Ops1;
41299 APInt KnownUndef1, KnownZero1;
41300 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
41301 KnownZero1)) {
41302 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
41303 // Zero/UNDEF insertion - zero out element and remove dependency.
41304 InsertPSMask |= (1u << DstIdx);
41305 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41306 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41307 }
41308 // Update insertps mask srcidx and reference the source input directly.
41309 int M = TargetMask1[SrcIdx];
41310 assert(0 <= M && M < 8 && "Shuffle index out of range");
41311 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
41312 Op1 = Ops1[M < 4 ? 0 : 1];
41313 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41314 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41315 }
41316
41317 // Attempt to merge insertps Op0 with an inner target shuffle node.
41318 SmallVector<int, 8> TargetMask0;
41319 SmallVector<SDValue, 2> Ops0;
41320 APInt KnownUndef0, KnownZero0;
41321 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41322 KnownZero0)) {
41323 bool Updated = false;
41324 bool UseInput00 = false;
41325 bool UseInput01 = false;
41326 for (int i = 0; i != 4; ++i) {
41327 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41328 // No change if element is already zero or the inserted element.
41329 continue;
41330 }
41331
41332 if (KnownUndef0[i] || KnownZero0[i]) {
41333 // If the target mask is undef/zero then we must zero the element.
41334 InsertPSMask |= (1u << i);
41335 Updated = true;
41336 continue;
41337 }
41338
41339 // The input vector element must be inline.
41340 int M = TargetMask0[i];
41341 if (M != i && M != (i + 4))
41342 return SDValue();
41343
41344 // Determine which inputs of the target shuffle we're using.
41345 UseInput00 |= (0 <= M && M < 4);
41346 UseInput01 |= (4 <= M);
41347 }
41348
41349 // If we're not using both inputs of the target shuffle then use the
41350 // referenced input directly.
41351 if (UseInput00 && !UseInput01) {
41352 Updated = true;
41353 Op0 = Ops0[0];
41354 } else if (!UseInput00 && UseInput01) {
41355 Updated = true;
41356 Op0 = Ops0[1];
41357 }
41358
41359 if (Updated)
41360 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41361 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41362 }
41363
41364 // If we're inserting an element from a vbroadcast load, fold the
41365 // load into the X86insertps instruction. We need to convert the scalar
41366 // load to a vector and clear the source lane of the INSERTPS control.
41367 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41368 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41369 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41370 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41371 MemIntr->getBasePtr(),
41372 MemIntr->getMemOperand());
41373 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41375 Load),
41376 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41377 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41378 return Insert;
41379 }
41380 }
41381
41382 return SDValue();
41383 }
41384 case X86ISD::VPERMV3: {
41385 // Combine VPERMV3 to widened VPERMV if the two source operands are split
41386 // from the same vector.
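// e.g. vpermv3(extract_subvector(X,0), M, extract_subvector(X,8)) with X a
// v16i32 source becomes extract_subvector(vpermv(widened M, X), 0).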
41387 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
41388 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
41389 MVT SVT = V1.getSimpleValueType();
41390 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41391 V1.getConstantOperandVal(1) == 0 &&
41392 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41393 V2.getConstantOperandVal(1) == SVT.getVectorNumElements() &&
41394 V1.getOperand(0) == V2.getOperand(0)) {
41395 EVT NVT = V1.getOperand(0).getValueType();
41396 if (NVT.is256BitVector() ||
41397 (NVT.is512BitVector() && Subtarget.hasEVEX512())) {
41398 MVT WideVT = MVT::getVectorVT(
41400 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
41401 DL, WideVT.getSizeInBits());
41402 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask,
41403 DAG.getBitcast(WideVT, V1.getOperand(0)));
41404 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
41405 DAG.getIntPtrConstant(0, DL));
41406 }
41407 }
41408 return SDValue();
41409 }
41410 default:
41411 return SDValue();
41412 }
41413
41414 // Nuke no-op shuffles that show up after combining.
41415 if (isNoopShuffleMask(Mask))
41416 return N.getOperand(0);
41417
41418 // Look for simplifications involving one or two shuffle instructions.
41419 SDValue V = N.getOperand(0);
41420 switch (N.getOpcode()) {
41421 default:
41422 break;
41423 case X86ISD::PSHUFLW:
41424 case X86ISD::PSHUFHW:
41425 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41426
41427 // See if this reduces to a PSHUFD which is no more expensive and can
41428 // combine with more operations. Note that it has to at least flip the
41429 // dwords as otherwise it would have been removed as a no-op.
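// e.g. PSHUFLW <2,3,0,1> swaps the two low dwords, which is PSHUFD <1,0,2,3>
// on the same vector viewed as v4i32.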
41430 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
41431 int DMask[] = {0, 1, 2, 3};
41432 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41433 DMask[DOffset + 0] = DOffset + 1;
41434 DMask[DOffset + 1] = DOffset + 0;
41435 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41436 V = DAG.getBitcast(DVT, V);
41437 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41438 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41439 return DAG.getBitcast(VT, V);
41440 }
41441
41442 // Look for shuffle patterns which can be implemented as a single unpack.
41443 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41444 // only works when we have a PSHUFD followed by two half-shuffles.
41445 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41446 (V.getOpcode() == X86ISD::PSHUFLW ||
41447 V.getOpcode() == X86ISD::PSHUFHW) &&
41448 V.getOpcode() != N.getOpcode() &&
41449 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41450 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41451 if (D.getOpcode() == X86ISD::PSHUFD) {
41452 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41453 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41454 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41455 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41456 int WordMask[8];
41457 for (int i = 0; i < 4; ++i) {
41458 WordMask[i + NOffset] = Mask[i] + NOffset;
41459 WordMask[i + VOffset] = VMask[i] + VOffset;
41460 }
41461 // Map the word mask through the DWord mask.
41462 int MappedMask[8];
41463 for (int i = 0; i < 8; ++i)
41464 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41465 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41466 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41467 // We can replace all three shuffles with an unpack.
41468 V = DAG.getBitcast(VT, D.getOperand(0));
41469 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41471 DL, VT, V, V);
41472 }
41473 }
41474 }
41475
41476 break;
41477
41478 case X86ISD::PSHUFD:
41479 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
41480 return NewN;
41481
41482 break;
41483 }
41484
41485 return SDValue();
41486}
41487
41488/// Checks if the shuffle mask takes subsequent elements
41489/// alternately from two vectors.
41490/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
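/// e.g. in <0, 5, 2, 7> the even result elements come from operand 0
/// (elements 0 and 2) and the odd result elements come from operand 1
/// (elements 1 and 3), which is the interleave ADDSUB/SUBADD expects.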
41491static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41492
41493 int ParitySrc[2] = {-1, -1};
41494 unsigned Size = Mask.size();
41495 for (unsigned i = 0; i != Size; ++i) {
41496 int M = Mask[i];
41497 if (M < 0)
41498 continue;
41499
41500 // Make sure we are using the matching element from the input.
41501 if ((M % Size) != i)
41502 return false;
41503
41504 // Make sure we use the same input for all elements of the same parity.
41505 int Src = M / Size;
41506 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41507 return false;
41508 ParitySrc[i % 2] = Src;
41509 }
41510
41511 // Make sure each input is used.
41512 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41513 return false;
41514
41515 Op0Even = ParitySrc[0] == 0;
41516 return true;
41517}
41518
41519 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
41520 /// (SUBADD) operation. If true is returned then the operands of the ADDSUB
41521 /// (SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
41522 ///
41523 /// We combine shuffles into ADDSUB (SUBADD) directly on the abstract vector
41524 /// shuffle nodes so it is easier to generically match. We also insert dummy
41525 /// vector shuffle nodes for the operands which explicitly discard the lanes
41526 /// which are unused by this operation so that the rest of the combiner can
41527 /// see that they're unused.
41528static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41529 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41530 bool &IsSubAdd) {
41531
41532 EVT VT = N->getValueType(0);
41533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41534 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41536 return false;
41537
41538 // We only handle target-independent shuffles.
41539 // FIXME: It would be easy and harmless to use the target shuffle mask
41540 // extraction tool to support more.
41541 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41542 return false;
41543
41544 SDValue V1 = N->getOperand(0);
41545 SDValue V2 = N->getOperand(1);
41546
41547 // Make sure we have an FADD and an FSUB.
41548 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41549 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41550 V1.getOpcode() == V2.getOpcode())
41551 return false;
41552
41553 // If there are other uses of these operations we can't fold them.
41554 if (!V1->hasOneUse() || !V2->hasOneUse())
41555 return false;
41556
41557 // Ensure that both operations have the same operands. Note that we can
41558 // commute the FADD operands.
41559 SDValue LHS, RHS;
41560 if (V1.getOpcode() == ISD::FSUB) {
41561 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41562 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41563 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41564 return false;
41565 } else {
41566 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41567 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41568 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41569 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41570 return false;
41571 }
41572
41573 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41574 bool Op0Even;
41575 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41576 return false;
41577
41578 // It's a SUBADD if the vector providing the even result elements is an FADD.
41579 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41580 : V2->getOpcode() == ISD::FADD;
41581
41582 Opnd0 = LHS;
41583 Opnd1 = RHS;
41584 return true;
41585}
41586
41587/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41589 const X86Subtarget &Subtarget,
41590 SelectionDAG &DAG) {
41591 // We only handle target-independent shuffles.
41592 // FIXME: It would be easy and harmless to use the target shuffle mask
41593 // extraction tool to support more.
41594 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41595 return SDValue();
41596
41597 MVT VT = N->getSimpleValueType(0);
41598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41599 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41600 return SDValue();
41601
41602 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41603 SDValue Op0 = N->getOperand(0);
41604 SDValue Op1 = N->getOperand(1);
41605 SDValue FMAdd = Op0, FMSub = Op1;
41606 if (FMSub.getOpcode() != X86ISD::FMSUB)
41607 std::swap(FMAdd, FMSub);
41608
41609 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41610 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41611 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41612 FMAdd.getOperand(2) != FMSub.getOperand(2))
41613 return SDValue();
41614
41615 // Check for correct shuffle mask.
41616 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41617 bool Op0Even;
41618 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41619 return SDValue();
41620
41621 // FMAddSub takes the zeroth operand from the FMSub node.
41622 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41623 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41624 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41625 FMAdd.getOperand(2));
41626}
41627
41628/// Try to combine a shuffle into a target-specific add-sub or
41629/// mul-add-sub node.
41631 const X86Subtarget &Subtarget,
41632 SelectionDAG &DAG) {
41633 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41634 return V;
41635
41636 SDValue Opnd0, Opnd1;
41637 bool IsSubAdd;
41638 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41639 return SDValue();
41640
41641 MVT VT = N->getSimpleValueType(0);
41642
41643 // Try to generate X86ISD::FMADDSUB node here.
41644 SDValue Opnd2;
41645 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41646 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41647 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41648 }
41649
41650 if (IsSubAdd)
41651 return SDValue();
41652
41653 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41654 // the ADDSUB idiom has been successfully recognized. There are no known
41655 // X86 targets with 512-bit ADDSUB instructions!
41656 if (VT.is512BitVector())
41657 return SDValue();
41658
41659 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41660 // the ADDSUB idiom has been successfully recognized. There are no known
41661 // X86 targets with FP16 ADDSUB instructions!
41662 if (VT.getVectorElementType() == MVT::f16)
41663 return SDValue();
41664
41665 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41666}
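// Illustrative sketch (guarded out): the kind of source-level pattern whose
// vectorized DAG - an FSUB and an FADD blended by an alternating shuffle -
// the combines above turn into X86ISD::ADDSUB (addsubps/addsubpd) or
// FMADDSUB/FMSUBADD. Function and parameter names are hypothetical.
#if 0
static void addSubIdiomExample(float *A, const float *B, const float *C,
                               unsigned N) {
  for (unsigned I = 0; I != N; ++I)
    A[I] = (I & 1) ? B[I] + C[I]  // odd lanes add,
                   : B[I] - C[I]; // even lanes subtract, as ADDSUB does.
}
#endif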
41667
41668// We are looking for a shuffle where both sources are concatenated with undef
41669// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41670// if we can express this as a single-source shuffle, that's preferable.
41672 SelectionDAG &DAG,
41673 const X86Subtarget &Subtarget) {
41674 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41675 return SDValue();
41676
41677 EVT VT = N->getValueType(0);
41678
41679 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41680 if (!VT.is128BitVector() && !VT.is256BitVector())
41681 return SDValue();
41682
41683 if (VT.getVectorElementType() != MVT::i32 &&
41684 VT.getVectorElementType() != MVT::i64 &&
41685 VT.getVectorElementType() != MVT::f32 &&
41686 VT.getVectorElementType() != MVT::f64)
41687 return SDValue();
41688
41689 SDValue N0 = N->getOperand(0);
41690 SDValue N1 = N->getOperand(1);
41691
41692 // Check that both sources are concats with undef.
41693 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41694 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41695 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41696 !N1.getOperand(1).isUndef())
41697 return SDValue();
41698
41699 // Construct the new shuffle mask. Elements from the first source retain their
41700 // index, but elements from the second source no longer need to skip an undef.
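// e.g. with t1,t2 of type v4i32, a v8i32 mask <0,8,1,9,2,10,3,11> over
// concat(t1,undef) and concat(t2,undef) becomes <0,4,1,5,2,6,3,7> over
// concat(t1,t2).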
41701 SmallVector<int, 8> Mask;
41702 int NumElts = VT.getVectorNumElements();
41703
41704 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41705 for (int Elt : SVOp->getMask())
41706 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41707
41709 N1.getOperand(0));
41710 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41711}
41712
41713/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41714/// low half of each source vector and does not set any high half elements in
41715/// the destination vector, narrow the shuffle to half its original size.
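/// e.g. a v8f32 shuffle <0,8,1,9,u,u,u,u> of X,Y only reads the low v4f32
/// half of each source, so it can be performed as a v4f32 shuffle <0,4,1,5>
/// of those halves and then widened with a free subvector insertion.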
41717 EVT VT = Shuf->getValueType(0);
41718 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41719 return SDValue();
41720 if (!VT.is256BitVector() && !VT.is512BitVector())
41721 return SDValue();
41722
41723 // See if we can ignore all of the high elements of the shuffle.
41724 ArrayRef<int> Mask = Shuf->getMask();
41725 if (!isUndefUpperHalf(Mask))
41726 return SDValue();
41727
41728 // Check if the shuffle mask accesses only the low half of each input vector
41729 // (half-index output is 0 or 2).
41730 int HalfIdx1, HalfIdx2;
41731 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41732 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41733 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41734 return SDValue();
41735
41736 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41737 // The trick is knowing that all of the insert/extract are actually free
41738 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41739 // of narrow inputs into a narrow output, and that is always cheaper than
41740 // the wide shuffle that we started with.
41741 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41742 Shuf->getOperand(1), HalfMask, HalfIdx1,
41743 HalfIdx2, false, DAG, /*UseConcat*/ true);
41744}
41745
41748 const X86Subtarget &Subtarget) {
41749 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41750 if (SDValue V = narrowShuffle(Shuf, DAG))
41751 return V;
41752
41753 // If we have legalized the vector types, look for blends of FADD and FSUB
41754 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41755 SDLoc dl(N);
41756 EVT VT = N->getValueType(0);
41757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41758 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41759 if (SDValue AddSub =
41760 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41761 return AddSub;
41762
41763 // Attempt to combine into a vector load/broadcast.
41765 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41766 return LD;
41767
41768 // For AVX2, we sometimes want to combine
41769 // (vector_shuffle <mask> (concat_vectors t1, undef)
41770 // (concat_vectors t2, undef))
41771 // Into:
41772 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41773 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41774 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41775 return ShufConcat;
41776
41777 if (isTargetShuffle(N->getOpcode())) {
41778 SDValue Op(N, 0);
41779 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41780 return Shuffle;
41781
41782 // Try recursively combining arbitrary sequences of x86 shuffle
41783 // instructions into higher-order shuffles. We do this after combining
41784 // specific PSHUF instruction sequences into their minimal form so that we
41785 // can evaluate how many specialized shuffle instructions are involved in
41786 // a particular chain.
41787 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41788 return Res;
41789
41790 // Simplify source operands based on shuffle mask.
41791 // TODO - merge this into combineX86ShufflesRecursively.
41792 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41793 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41794 return SDValue(N, 0);
41795
41796 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41797 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41798 // Perform this after other shuffle combines to allow inner shuffles to be
41799 // combined away first.
41800 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41801 return BinOp;
41802 }
41803
41804 return SDValue();
41805}
41806
41807// Simplify variable target shuffle masks based on the demanded elements.
41808// TODO: Handle DemandedBits in mask indices as well?
41810 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41811 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41812 // If we're demanding all elements don't bother trying to simplify the mask.
41813 unsigned NumElts = DemandedElts.getBitWidth();
41814 if (DemandedElts.isAllOnes())
41815 return false;
41816
41817 SDValue Mask = Op.getOperand(MaskIndex);
41818 if (!Mask.hasOneUse())
41819 return false;
41820
41821 // Attempt to generically simplify the variable shuffle mask.
41822 APInt MaskUndef, MaskZero;
41823 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41824 Depth + 1))
41825 return true;
41826
41827 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41828 // TODO: Support other types from getTargetShuffleMaskIndices?
41830 EVT BCVT = BC.getValueType();
41831 auto *Load = dyn_cast<LoadSDNode>(BC);
41832 if (!Load || !Load->getBasePtr().hasOneUse())
41833 return false;
41834
41835 const Constant *C = getTargetConstantFromNode(Load);
41836 if (!C)
41837 return false;
41838
41839 Type *CTy = C->getType();
41840 if (!CTy->isVectorTy() ||
41841 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41842 return false;
41843
41844 // Handle scaling for i64 elements on 32-bit targets.
41845 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41846 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41847 return false;
41848 unsigned Scale = NumCstElts / NumElts;
41849
41850 // Simplify mask if we have an undemanded element that is not undef.
41851 bool Simplified = false;
41852 SmallVector<Constant *, 32> ConstVecOps;
41853 for (unsigned i = 0; i != NumCstElts; ++i) {
41854 Constant *Elt = C->getAggregateElement(i);
41855 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41856 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41857 Simplified = true;
41858 continue;
41859 }
41860 ConstVecOps.push_back(Elt);
41861 }
41862 if (!Simplified)
41863 return false;
41864
41865 // Generate new constant pool entry + legalize immediately for the load.
41866 SDLoc DL(Op);
41867 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41868 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41869 SDValue NewMask = TLO.DAG.getLoad(
41870 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41872 Load->getAlign());
41873 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41874}
41875
41877 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41878 TargetLoweringOpt &TLO, unsigned Depth) const {
41879 int NumElts = DemandedElts.getBitWidth();
41880 unsigned Opc = Op.getOpcode();
41881 EVT VT = Op.getValueType();
41882
41883 // Handle special case opcodes.
41884 switch (Opc) {
41885 case X86ISD::PMULDQ:
41886 case X86ISD::PMULUDQ: {
41887 APInt LHSUndef, LHSZero;
41888 APInt RHSUndef, RHSZero;
41889 SDValue LHS = Op.getOperand(0);
41890 SDValue RHS = Op.getOperand(1);
41891 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41892 Depth + 1))
41893 return true;
41894 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41895 Depth + 1))
41896 return true;
41897 // Multiply by zero.
41898 KnownZero = LHSZero | RHSZero;
41899 break;
41900 }
41901 case X86ISD::VPMADDUBSW:
41902 case X86ISD::VPMADDWD: {
41903 APInt LHSUndef, LHSZero;
41904 APInt RHSUndef, RHSZero;
41905 SDValue LHS = Op.getOperand(0);
41906 SDValue RHS = Op.getOperand(1);
41907 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41908
41909 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41910 Depth + 1))
41911 return true;
41912 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41913 Depth + 1))
41914 return true;
41915
41916 // TODO: Multiply by zero.
41917
41918 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41919 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41920 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41921 Depth + 1))
41922 return true;
41923 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41924 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41925 Depth + 1))
41926 return true;
41927 break;
41928 }
41929 case X86ISD::PSADBW: {
41930 SDValue LHS = Op.getOperand(0);
41931 SDValue RHS = Op.getOperand(1);
41932 assert(VT.getScalarType() == MVT::i64 &&
41933 LHS.getValueType() == RHS.getValueType() &&
41934 LHS.getValueType().getScalarType() == MVT::i8 &&
41935 "Unexpected PSADBW types");
41936
41937 // Aggressively peek through ops to get at the demanded elts.
41938 if (!DemandedElts.isAllOnes()) {
41939 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
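// Each i64 result element of PSADBW sums absolute differences of 8 byte
// pairs, so scale the demanded mask from i64 elements to i8 source elements.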
41940 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41942 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41944 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41945 if (NewLHS || NewRHS) {
41946 NewLHS = NewLHS ? NewLHS : LHS;
41947 NewRHS = NewRHS ? NewRHS : RHS;
41948 return TLO.CombineTo(
41949 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41950 }
41951 }
41952 break;
41953 }
41954 case X86ISD::VSHL:
41955 case X86ISD::VSRL:
41956 case X86ISD::VSRA: {
41957 // We only need the bottom 64-bits of the (128-bit) shift amount.
41958 SDValue Amt = Op.getOperand(1);
41959 MVT AmtVT = Amt.getSimpleValueType();
41960 assert(AmtVT.is128BitVector() && "Unexpected value type");
41961
41962 // If every use of the shift amount is as an SSE shift amount then we know
41963 // that only the bottom 64-bits are ever used.
41964 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41965 unsigned UseOpc = Use->getOpcode();
41966 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41967 UseOpc == X86ISD::VSRA) &&
41968 Use->getOperand(0) != Amt;
41969 });
41970
41971 APInt AmtUndef, AmtZero;
41972 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41973 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41974 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41975 Depth + 1, AssumeSingleUse))
41976 return true;
41977 [[fallthrough]];
41978 }
41979 case X86ISD::VSHLI:
41980 case X86ISD::VSRLI:
41981 case X86ISD::VSRAI: {
41982 SDValue Src = Op.getOperand(0);
41983 APInt SrcUndef;
41984 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41985 Depth + 1))
41986 return true;
41987
41988 // Fold shift(0,x) -> 0
41989 if (DemandedElts.isSubsetOf(KnownZero))
41990 return TLO.CombineTo(
41991 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41992
41993 // Aggressively peek through ops to get at the demanded elts.
41994 if (!DemandedElts.isAllOnes())
41996 Src, DemandedElts, TLO.DAG, Depth + 1))
41997 return TLO.CombineTo(
41998 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41999 break;
42000 }
42001 case X86ISD::VPSHA:
42002 case X86ISD::VPSHL:
42003 case X86ISD::VSHLV:
42004 case X86ISD::VSRLV:
42005 case X86ISD::VSRAV: {
42006 APInt LHSUndef, LHSZero;
42007 APInt RHSUndef, RHSZero;
42008 SDValue LHS = Op.getOperand(0);
42009 SDValue RHS = Op.getOperand(1);
42010 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42011 Depth + 1))
42012 return true;
42013
42014 // Fold shift(0,x) -> 0
42015 if (DemandedElts.isSubsetOf(LHSZero))
42016 return TLO.CombineTo(
42017 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42018
42019 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42020 Depth + 1))
42021 return true;
42022
42023 KnownZero = LHSZero;
42024 break;
42025 }
42026 case X86ISD::PCMPEQ:
42027 case X86ISD::PCMPGT: {
42028 APInt LHSUndef, LHSZero;
42029 APInt RHSUndef, RHSZero;
42030 SDValue LHS = Op.getOperand(0);
42031 SDValue RHS = Op.getOperand(1);
42032 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42033 Depth + 1))
42034 return true;
42035 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42036 Depth + 1))
42037 return true;
42038 break;
42039 }
42040 case X86ISD::KSHIFTL: {
42041 SDValue Src = Op.getOperand(0);
42042 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42043 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42044 unsigned ShiftAmt = Amt->getZExtValue();
42045
42046 if (ShiftAmt == 0)
42047 return TLO.CombineTo(Op, Src);
42048
42049 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42050 // single shift. We can do this if the bottom bits (which are shifted
42051 // out) are never demanded.
42052 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42053 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42054 unsigned C1 = Src.getConstantOperandVal(1);
42055 unsigned NewOpc = X86ISD::KSHIFTL;
42056 int Diff = ShiftAmt - C1;
42057 if (Diff < 0) {
42058 Diff = -Diff;
42059 NewOpc = X86ISD::KSHIFTR;
42060 }
42061
42062 SDLoc dl(Op);
42063 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42064 return TLO.CombineTo(
42065 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42066 }
42067 }
42068
42069 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42070 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42071 Depth + 1))
42072 return true;
42073
42074 KnownUndef <<= ShiftAmt;
42075 KnownZero <<= ShiftAmt;
42076 KnownZero.setLowBits(ShiftAmt);
42077 break;
42078 }
42079 case X86ISD::KSHIFTR: {
42080 SDValue Src = Op.getOperand(0);
42081 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42082 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42083 unsigned ShiftAmt = Amt->getZExtValue();
42084
42085 if (ShiftAmt == 0)
42086 return TLO.CombineTo(Op, Src);
42087
42088 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42089 // single shift. We can do this if the top bits (which are shifted
42090 // out) are never demanded.
42091 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42092 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42093 unsigned C1 = Src.getConstantOperandVal(1);
42094 unsigned NewOpc = X86ISD::KSHIFTR;
42095 int Diff = ShiftAmt - C1;
42096 if (Diff < 0) {
42097 Diff = -Diff;
42098 NewOpc = X86ISD::KSHIFTL;
42099 }
42100
42101 SDLoc dl(Op);
42102 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42103 return TLO.CombineTo(
42104 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42105 }
42106 }
42107
42108 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42109 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42110 Depth + 1))
42111 return true;
42112
42113 KnownUndef.lshrInPlace(ShiftAmt);
42114 KnownZero.lshrInPlace(ShiftAmt);
42115 KnownZero.setHighBits(ShiftAmt);
42116 break;
42117 }
42118 case X86ISD::ANDNP: {
42119 // ANDNP = (~LHS & RHS);
42120 SDValue LHS = Op.getOperand(0);
42121 SDValue RHS = Op.getOperand(1);
42122
42123 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42124 APInt UndefElts;
42125 SmallVector<APInt> EltBits;
42126 int NumElts = VT.getVectorNumElements();
42127 int EltSizeInBits = VT.getScalarSizeInBits();
42128 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42129 APInt OpElts = DemandedElts;
42130 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42131 EltBits)) {
42132 OpBits.clearAllBits();
42133 OpElts.clearAllBits();
42134 for (int I = 0; I != NumElts; ++I) {
42135 if (!DemandedElts[I])
42136 continue;
42137 if (UndefElts[I]) {
42138 // We can't assume an undef src element gives an undef dst - the
42139 // other src might be zero.
42140 OpBits.setAllBits();
42141 OpElts.setBit(I);
42142 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42143 (!Invert && !EltBits[I].isZero())) {
42144 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42145 OpElts.setBit(I);
42146 }
42147 }
42148 }
42149 return std::make_pair(OpBits, OpElts);
42150 };
42151 APInt BitsLHS, EltsLHS;
42152 APInt BitsRHS, EltsRHS;
42153 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42154 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42155
42156 APInt LHSUndef, LHSZero;
42157 APInt RHSUndef, RHSZero;
42158 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42159 Depth + 1))
42160 return true;
42161 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42162 Depth + 1))
42163 return true;
42164
42165 if (!DemandedElts.isAllOnes()) {
42166 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42167 TLO.DAG, Depth + 1);
42168 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42169 TLO.DAG, Depth + 1);
42170 if (NewLHS || NewRHS) {
42171 NewLHS = NewLHS ? NewLHS : LHS;
42172 NewRHS = NewRHS ? NewRHS : RHS;
42173 return TLO.CombineTo(
42174 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42175 }
42176 }
42177 break;
42178 }
42179 case X86ISD::CVTSI2P:
42180 case X86ISD::CVTUI2P:
42181 case X86ISD::CVTPH2PS:
42182 case X86ISD::CVTPS2PH: {
42183 SDValue Src = Op.getOperand(0);
42184 EVT SrcVT = Src.getValueType();
42185 APInt SrcUndef, SrcZero;
42186 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42187 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42188 Depth + 1))
42189 return true;
42190 break;
42191 }
42192 case X86ISD::PACKSS:
42193 case X86ISD::PACKUS: {
42194 SDValue N0 = Op.getOperand(0);
42195 SDValue N1 = Op.getOperand(1);
42196
42197 APInt DemandedLHS, DemandedRHS;
42198 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42199
42200 APInt LHSUndef, LHSZero;
42201 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42202 Depth + 1))
42203 return true;
42204 APInt RHSUndef, RHSZero;
42205 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42206 Depth + 1))
42207 return true;
42208
42209 // TODO - pass on known zero/undef.
42210
42211 // Aggressively peek through ops to get at the demanded elts.
42212 // TODO - we should do this for all target/faux shuffles ops.
42213 if (!DemandedElts.isAllOnes()) {
42214 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42215 TLO.DAG, Depth + 1);
42216 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42217 TLO.DAG, Depth + 1);
42218 if (NewN0 || NewN1) {
42219 NewN0 = NewN0 ? NewN0 : N0;
42220 NewN1 = NewN1 ? NewN1 : N1;
42221 return TLO.CombineTo(Op,
42222 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42223 }
42224 }
42225 break;
42226 }
42227 case X86ISD::HADD:
42228 case X86ISD::HSUB:
42229 case X86ISD::FHADD:
42230 case X86ISD::FHSUB: {
42231 SDValue N0 = Op.getOperand(0);
42232 SDValue N1 = Op.getOperand(1);
42233
42234 APInt DemandedLHS, DemandedRHS;
42235 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42236
42237 APInt LHSUndef, LHSZero;
42238 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42239 Depth + 1))
42240 return true;
42241 APInt RHSUndef, RHSZero;
42242 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42243 Depth + 1))
42244 return true;
42245
42246 // TODO - pass on known zero/undef.
42247
42248 // Aggressively peek through ops to get at the demanded elts.
42249 // TODO: Handle repeated operands.
42250 if (N0 != N1 && !DemandedElts.isAllOnes()) {
42251 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42252 TLO.DAG, Depth + 1);
42253 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42254 TLO.DAG, Depth + 1);
42255 if (NewN0 || NewN1) {
42256 NewN0 = NewN0 ? NewN0 : N0;
42257 NewN1 = NewN1 ? NewN1 : N1;
42258 return TLO.CombineTo(Op,
42259 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42260 }
42261 }
42262 break;
42263 }
42264 case X86ISD::VTRUNC:
42265 case X86ISD::VTRUNCS:
42266 case X86ISD::VTRUNCUS: {
42267 SDValue Src = Op.getOperand(0);
42268 MVT SrcVT = Src.getSimpleValueType();
42269 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42270 APInt SrcUndef, SrcZero;
42271 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
42272 Depth + 1))
42273 return true;
42274 KnownZero = SrcZero.zextOrTrunc(NumElts);
42275 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
42276 break;
42277 }
42278 case X86ISD::BLENDI: {
42279 SmallVector<int, 16> BlendMask;
42280 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
42281 if (SDValue R = combineBlendOfPermutes(
42282 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
42283 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
42284 return TLO.CombineTo(Op, R);
42285 break;
42286 }
42287 case X86ISD::BLENDV: {
42288 APInt SelUndef, SelZero;
42289 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
42290 SelZero, TLO, Depth + 1))
42291 return true;
42292
42293 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
42294 APInt LHSUndef, LHSZero;
42295 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
42296 LHSZero, TLO, Depth + 1))
42297 return true;
42298
42299 APInt RHSUndef, RHSZero;
42300 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
42301 RHSZero, TLO, Depth + 1))
42302 return true;
42303
42304 KnownZero = LHSZero & RHSZero;
42305 KnownUndef = LHSUndef & RHSUndef;
42306 break;
42307 }
42308 case X86ISD::VZEXT_MOVL: {
42309 // If upper demanded elements are already zero then we have nothing to do.
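// e.g. (vzext_movl X) -> X if only element 0 is demanded or the demanded
// upper elements of X are already known to be zero.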
42310 SDValue Src = Op.getOperand(0);
42311 APInt DemandedUpperElts = DemandedElts;
42312 DemandedUpperElts.clearLowBits(1);
42313 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
42314 return TLO.CombineTo(Op, Src);
42315 break;
42316 }
42317 case X86ISD::VZEXT_LOAD: {
42318 // If the upper elements are not demanded then simplify to a
42319 // scalar_to_vector(load()).
42320 MVT SVT = VT.getSimpleVT().getVectorElementType();
42321 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
42322 SDLoc DL(Op);
42323 auto *Mem = cast<MemSDNode>(Op);
42324 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
42325 Mem->getMemOperand());
42326 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
42327 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
42328 }
42329 break;
42330 }
42331 case X86ISD::VBROADCAST: {
42332 SDValue Src = Op.getOperand(0);
42333 MVT SrcVT = Src.getSimpleValueType();
42334 if (!SrcVT.isVector())
42335 break;
42336 // Don't bother broadcasting if we just need the 0'th element.
42337 if (DemandedElts == 1) {
42338 if (Src.getValueType() != VT)
42339 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
42340 SDLoc(Op));
42341 return TLO.CombineTo(Op, Src);
42342 }
42343 APInt SrcUndef, SrcZero;
42344 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
42345 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42346 Depth + 1))
42347 return true;
42348 // Aggressively peek through src to get at the demanded elt.
42349 // TODO - we should do this for all target/faux shuffles ops.
42350 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42351 Src, SrcElts, TLO.DAG, Depth + 1))
42352 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42353 break;
42354 }
42355 case X86ISD::VPERMV:
42356 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42357 Depth))
42358 return true;
42359 break;
42360 case X86ISD::PSHUFB:
42361 case X86ISD::VPERMV3:
42362 case X86ISD::VPERMILPV:
42363 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42364 Depth))
42365 return true;
42366 break;
42367 case X86ISD::VPPERM:
42368 case X86ISD::VPERMIL2:
42369 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42370 Depth))
42371 return true;
42372 break;
42373 }
42374
42375 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42376 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42377 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42378 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42379 DemandedElts.lshr(NumElts / 2) == 0) {
42380 unsigned SizeInBits = VT.getSizeInBits();
42381 unsigned ExtSizeInBits = SizeInBits / 2;
42382
42383 // See if 512-bit ops only use the bottom 128-bits.
42384 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42385 ExtSizeInBits = SizeInBits / 4;
42386
42387 switch (Opc) {
42388 // Scalar broadcast.
42389 case X86ISD::VBROADCAST: {
42390 SDLoc DL(Op);
42391 SDValue Src = Op.getOperand(0);
42392 if (Src.getValueSizeInBits() > ExtSizeInBits)
42393 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42394 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42395 ExtSizeInBits / VT.getScalarSizeInBits());
42396 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42397 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42398 TLO.DAG, DL, ExtSizeInBits));
42399 }
42400 case X86ISD::VBROADCAST_LOAD: {
42401 SDLoc DL(Op);
42402 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42403 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42404 ExtSizeInBits / VT.getScalarSizeInBits());
42405 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42406 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42407 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42408 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42409 MemIntr->getMemOperand());
42410 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42411 Bcst.getValue(1));
42412 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42413 TLO.DAG, DL, ExtSizeInBits));
42414 }
42415 // Subvector broadcast.
42416 case X86ISD::SUBV_BROADCAST_LOAD: {
42417 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42418 EVT MemVT = MemIntr->getMemoryVT();
42419 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42420 SDLoc DL(Op);
42421 SDValue Ld =
42422 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42423 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42424 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42425 Ld.getValue(1));
42426 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42427 TLO.DAG, DL, ExtSizeInBits));
42428 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42429 SDLoc DL(Op);
42430 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42431 ExtSizeInBits / VT.getScalarSizeInBits());
42432 if (SDValue BcstLd =
42433 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42434 return TLO.CombineTo(Op,
42435 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42436 TLO.DAG, DL, ExtSizeInBits));
42437 }
42438 break;
42439 }
42440 // Byte shifts by immediate.
42441 case X86ISD::VSHLDQ:
42442 case X86ISD::VSRLDQ:
42443 // Shift by uniform.
42444 case X86ISD::VSHL:
42445 case X86ISD::VSRL:
42446 case X86ISD::VSRA:
42447 // Shift by immediate.
42448 case X86ISD::VSHLI:
42449 case X86ISD::VSRLI:
42450 case X86ISD::VSRAI: {
42451 SDLoc DL(Op);
42452 SDValue Ext0 =
42453 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42454 SDValue ExtOp =
42455 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42456 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42457 SDValue Insert =
42458 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42459 return TLO.CombineTo(Op, Insert);
42460 }
42461 case X86ISD::VPERMI: {
42462 // Simplify PERMPD/PERMQ to extract_subvector.
42463 // TODO: This should be done in shuffle combining.
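// e.g. (permq X, <2,3,u,u>) with only the low half demanded becomes
// insert_subvector(undef, extract_subvector(X, 2), 0).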
42464 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42465 SmallVector<int, 4> Mask;
42466 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42467 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42468 SDLoc DL(Op);
42469 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42470 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42471 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42472 return TLO.CombineTo(Op, Insert);
42473 }
42474 }
42475 break;
42476 }
42477 case X86ISD::VPERM2X128: {
42478 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
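// The low nibble of the immediate selects the result's low 128-bit lane:
// bit 3 zeroes it, bit 1 selects the second source operand and bit 0 picks
// that source's upper lane.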
42479 SDLoc DL(Op);
42480 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42481 if (LoMask & 0x8)
42482 return TLO.CombineTo(
42483 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42484 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42485 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42486 SDValue ExtOp =
42487 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42488 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42489 SDValue Insert =
42490 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42491 return TLO.CombineTo(Op, Insert);
42492 }
42493 // Zero upper elements.
42494 case X86ISD::VZEXT_MOVL:
42495 // Target unary shuffles by immediate:
42496 case X86ISD::PSHUFD:
42497 case X86ISD::PSHUFLW:
42498 case X86ISD::PSHUFHW:
42499 case X86ISD::VPERMILPI:
42500 // (Non-Lane Crossing) Target Shuffles.
42501 case X86ISD::VPERMILPV:
42502 case X86ISD::VPERMIL2:
42503 case X86ISD::PSHUFB:
42504 case X86ISD::UNPCKL:
42505 case X86ISD::UNPCKH:
42506 case X86ISD::BLENDI:
42507 // Integer ops.
42508 case X86ISD::PACKSS:
42509 case X86ISD::PACKUS:
42510 case X86ISD::PCMPEQ:
42511 case X86ISD::PCMPGT:
42512 case X86ISD::PMULUDQ:
42513 case X86ISD::PMULDQ:
42514 case X86ISD::VSHLV:
42515 case X86ISD::VSRLV:
42516 case X86ISD::VSRAV:
42517 // Float ops.
42518 case X86ISD::FMAX:
42519 case X86ISD::FMIN:
42520 case X86ISD::FMAXC:
42521 case X86ISD::FMINC:
42522 // Horizontal Ops.
42523 case X86ISD::HADD:
42524 case X86ISD::HSUB:
42525 case X86ISD::FHADD:
42526 case X86ISD::FHSUB: {
42527 SDLoc DL(Op);
42528 SmallVector<SDValue, 4> Ops;
42529 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42530 SDValue SrcOp = Op.getOperand(i);
42531 EVT SrcVT = SrcOp.getValueType();
42532 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42533 "Unsupported vector size");
42534 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42535 ExtSizeInBits)
42536 : SrcOp);
42537 }
42538 MVT ExtVT = VT.getSimpleVT();
42539 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42540 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42541 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42542 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42543 SDValue Insert =
42544 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42545 return TLO.CombineTo(Op, Insert);
42546 }
42547 }
42548 }
42549
42550 // For splats, unless we *only* demand the 0'th element, stop attempting
42551 // simplification here: we aren't going to improve things, and keeping the
42552 // splat is better than any potential shuffle.
42553 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42554 return false;
42555
42556 // Get target/faux shuffle mask.
42557 APInt OpUndef, OpZero;
42558 SmallVector<int, 64> OpMask;
42559 SmallVector<SDValue, 2> OpInputs;
42560 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42561 OpZero, TLO.DAG, Depth, false))
42562 return false;
42563
42564 // Shuffle inputs must be the same size as the result.
42565 if (OpMask.size() != (unsigned)NumElts ||
42566 llvm::any_of(OpInputs, [VT](SDValue V) {
42567 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42568 !V.getValueType().isVector();
42569 }))
42570 return false;
42571
42572 KnownZero = OpZero;
42573 KnownUndef = OpUndef;
42574
42575 // Check if shuffle mask can be simplified to undef/zero/identity.
42576 int NumSrcs = OpInputs.size();
42577 for (int i = 0; i != NumElts; ++i)
42578 if (!DemandedElts[i])
42579 OpMask[i] = SM_SentinelUndef;
42580
42581 if (isUndefInRange(OpMask, 0, NumElts)) {
42582 KnownUndef.setAllBits();
42583 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42584 }
42585 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42586 KnownZero.setAllBits();
42587 return TLO.CombineTo(
42588 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42589 }
42590 for (int Src = 0; Src != NumSrcs; ++Src)
42591 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42592 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42593
42594 // Attempt to simplify inputs.
42595 for (int Src = 0; Src != NumSrcs; ++Src) {
42596 // TODO: Support inputs of different types.
42597 if (OpInputs[Src].getValueType() != VT)
42598 continue;
42599
42600 int Lo = Src * NumElts;
42601 APInt SrcElts = APInt::getZero(NumElts);
42602 for (int i = 0; i != NumElts; ++i)
42603 if (DemandedElts[i]) {
42604 int M = OpMask[i] - Lo;
42605 if (0 <= M && M < NumElts)
42606 SrcElts.setBit(M);
42607 }
42608
42609 // TODO - Propagate input undef/zero elts.
42610 APInt SrcUndef, SrcZero;
42611 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42612 TLO, Depth + 1))
42613 return true;
42614 }
42615
42616 // If we don't demand all elements, then attempt to combine to a simpler
42617 // shuffle.
42618 // We need to convert the depth to something combineX86ShufflesRecursively
42619 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42620 // to match. This prevents combineX86ShuffleChain from returning a
42621 // combined shuffle that's the same as the original root, causing an
42622 // infinite loop.
42623 if (!DemandedElts.isAllOnes()) {
42624 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42625
42626 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42627 for (int i = 0; i != NumElts; ++i)
42628 if (DemandedElts[i])
42629 DemandedMask[i] = i;
42630
42631 SDValue NewShuffle = combineX86ShufflesRecursively(
42632 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42633 /*HasVarMask*/ false,
42634 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42635 Subtarget);
42636 if (NewShuffle)
42637 return TLO.CombineTo(Op, NewShuffle);
42638 }
42639
42640 return false;
42641}
42642
42643 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42644 SDValue Op, const APInt &OriginalDemandedBits,
42645 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42646 unsigned Depth) const {
42647 EVT VT = Op.getValueType();
42648 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42649 unsigned Opc = Op.getOpcode();
42650 switch(Opc) {
42651 case X86ISD::VTRUNC: {
42652 KnownBits KnownOp;
42653 SDValue Src = Op.getOperand(0);
42654 MVT SrcVT = Src.getSimpleValueType();
42655
42656 // Simplify the input, using demanded bit information.
42657 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42658 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42659 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42660 return true;
42661 break;
42662 }
42663 case X86ISD::PMULDQ:
42664 case X86ISD::PMULUDQ: {
42665 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
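// i.e. each 64-bit product only reads the low 32 bits of the corresponding
// 64-bit element from each operand.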
42666 KnownBits KnownLHS, KnownRHS;
42667 SDValue LHS = Op.getOperand(0);
42668 SDValue RHS = Op.getOperand(1);
42669
42670 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42671 // FIXME: Can we bound this better?
42672 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42673 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42674 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42675
42676 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42677 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42678 DemandedMaskLHS = DemandedMask;
42679 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42680 DemandedMaskRHS = DemandedMask;
42681
42682 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42683 KnownLHS, TLO, Depth + 1))
42684 return true;
42685 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42686 KnownRHS, TLO, Depth + 1))
42687 return true;
42688
42689 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42690 KnownRHS = KnownRHS.trunc(32);
42691 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42692 KnownRHS.getConstant().isOne()) {
42693 SDLoc DL(Op);
42694 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42695 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42696 }
42697
42698 // Aggressively peek through ops to get at the demanded low bits.
42699 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42700 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42701 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42702 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42703 if (DemandedLHS || DemandedRHS) {
42704 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42705 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42706 return TLO.CombineTo(
42707 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42708 }
42709 break;
42710 }
42711 case X86ISD::ANDNP: {
42712 KnownBits Known2;
42713 SDValue Op0 = Op.getOperand(0);
42714 SDValue Op1 = Op.getOperand(1);
42715
42716 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42717 Known, TLO, Depth + 1))
42718 return true;
42719
42720 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42721 OriginalDemandedElts, Known2, TLO, Depth + 1))
42722 return true;
42723
42724 // If the RHS is a constant, see if we can simplify it.
42725 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42726 OriginalDemandedElts, TLO))
42727 return true;
42728
42729 // ANDNP = (~Op0 & Op1);
42730 Known.One &= Known2.Zero;
42731 Known.Zero |= Known2.One;
42732 break;
42733 }
42734 case X86ISD::VSHLI: {
42735 SDValue Op0 = Op.getOperand(0);
42736
42737 unsigned ShAmt = Op.getConstantOperandVal(1);
42738 if (ShAmt >= BitWidth)
42739 break;
42740
42741 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42742
42743 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42744 // single shift. We can do this if the bottom bits (which are shifted
42745 // out) are never demanded.
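// e.g. (vshli (vsrli X, 2), 5) -> (vshli X, 3) when the low 5 bits of the
// result are not demanded.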
42746 if (Op0.getOpcode() == X86ISD::VSRLI &&
42747 OriginalDemandedBits.countr_zero() >= ShAmt) {
42748 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42749 if (Shift2Amt < BitWidth) {
42750 int Diff = ShAmt - Shift2Amt;
42751 if (Diff == 0)
42752 return TLO.CombineTo(Op, Op0.getOperand(0));
42753
42754 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42755 SDValue NewShift = TLO.DAG.getNode(
42756 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42757 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42758 return TLO.CombineTo(Op, NewShift);
42759 }
42760 }
42761
42762 // If we are only demanding sign bits then we can use the shift source directly.
42763 unsigned NumSignBits =
42764 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42765 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42766 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42767 return TLO.CombineTo(Op, Op0);
42768
42769 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42770 TLO, Depth + 1))
42771 return true;
42772
42773 Known.Zero <<= ShAmt;
42774 Known.One <<= ShAmt;
42775
42776 // Low bits known zero.
42777 Known.Zero.setLowBits(ShAmt);
42778 return false;
42779 }
42780 case X86ISD::VSRLI: {
42781 unsigned ShAmt = Op.getConstantOperandVal(1);
42782 if (ShAmt >= BitWidth)
42783 break;
42784
42785 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42786
42787 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42788 OriginalDemandedElts, Known, TLO, Depth + 1))
42789 return true;
42790
42791 Known.Zero.lshrInPlace(ShAmt);
42792 Known.One.lshrInPlace(ShAmt);
42793
42794 // High bits known zero.
42795 Known.Zero.setHighBits(ShAmt);
42796 return false;
42797 }
42798 case X86ISD::VSRAI: {
42799 SDValue Op0 = Op.getOperand(0);
42800 SDValue Op1 = Op.getOperand(1);
42801
42802 unsigned ShAmt = Op1->getAsZExtVal();
42803 if (ShAmt >= BitWidth)
42804 break;
42805
42806 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42807
42808 // If we just want the sign bit then we don't need to shift it.
42809 if (OriginalDemandedBits.isSignMask())
42810 return TLO.CombineTo(Op, Op0);
42811
42812 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42813 if (Op0.getOpcode() == X86ISD::VSHLI &&
42814 Op.getOperand(1) == Op0.getOperand(1)) {
42815 SDValue Op00 = Op0.getOperand(0);
42816 unsigned NumSignBits =
42817 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42818 if (ShAmt < NumSignBits)
42819 return TLO.CombineTo(Op, Op00);
42820 }
42821
42822 // If any of the demanded bits are produced by the sign extension, we also
42823 // demand the input sign bit.
42824 if (OriginalDemandedBits.countl_zero() < ShAmt)
42825 DemandedMask.setSignBit();
42826
42827 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42828 TLO, Depth + 1))
42829 return true;
42830
42831 Known.Zero.lshrInPlace(ShAmt);
42832 Known.One.lshrInPlace(ShAmt);
42833
42834 // If the input sign bit is known to be zero, or if none of the top bits
42835 // are demanded, turn this into an unsigned shift right.
42836 if (Known.Zero[BitWidth - ShAmt - 1] ||
42837 OriginalDemandedBits.countl_zero() >= ShAmt)
42838 return TLO.CombineTo(
42839 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42840
42841 // High bits are known one.
42842 if (Known.One[BitWidth - ShAmt - 1])
42843 Known.One.setHighBits(ShAmt);
42844 return false;
42845 }
42846 case X86ISD::BLENDV: {
42847 SDValue Sel = Op.getOperand(0);
42848 SDValue LHS = Op.getOperand(1);
42849 SDValue RHS = Op.getOperand(2);
42850
42851 APInt SignMask = APInt::getSignMask(BitWidth);
42852 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42853 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42854 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42855 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42856 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42857 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42858
42859 if (NewSel || NewLHS || NewRHS) {
42860 NewSel = NewSel ? NewSel : Sel;
42861 NewLHS = NewLHS ? NewLHS : LHS;
42862 NewRHS = NewRHS ? NewRHS : RHS;
42863 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42864 NewSel, NewLHS, NewRHS));
42865 }
42866 break;
42867 }
42868 case X86ISD::PEXTRB:
42869 case X86ISD::PEXTRW: {
42870 SDValue Vec = Op.getOperand(0);
42871 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42872 MVT VecVT = Vec.getSimpleValueType();
42873 unsigned NumVecElts = VecVT.getVectorNumElements();
42874
42875 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42876 unsigned Idx = CIdx->getZExtValue();
42877 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42878
42879 // If we demand no bits from the vector then we must have demanded
42880 // bits from the implicit zext - simplify to zero.
42881 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42882 if (DemandedVecBits == 0)
42883 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42884
42885 APInt KnownUndef, KnownZero;
42886 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42887 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42888 KnownZero, TLO, Depth + 1))
42889 return true;
42890
42891 KnownBits KnownVec;
42892 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42893 KnownVec, TLO, Depth + 1))
42894 return true;
42895
42896 if (SDValue V = SimplifyMultipleUseDemandedBits(
42897 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42898 return TLO.CombineTo(
42899 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42900
42901 Known = KnownVec.zext(BitWidth);
42902 return false;
42903 }
42904 break;
42905 }
42906 case X86ISD::PINSRB:
42907 case X86ISD::PINSRW: {
42908 SDValue Vec = Op.getOperand(0);
42909 SDValue Scl = Op.getOperand(1);
42910 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42911 MVT VecVT = Vec.getSimpleValueType();
42912
42913 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42914 unsigned Idx = CIdx->getZExtValue();
42915 if (!OriginalDemandedElts[Idx])
42916 return TLO.CombineTo(Op, Vec);
42917
42918 KnownBits KnownVec;
42919 APInt DemandedVecElts(OriginalDemandedElts);
42920 DemandedVecElts.clearBit(Idx);
42921 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42922 KnownVec, TLO, Depth + 1))
42923 return true;
42924
42925 KnownBits KnownScl;
42926 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42927 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42928 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42929 return true;
42930
42931 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42932 Known = KnownVec.intersectWith(KnownScl);
42933 return false;
42934 }
42935 break;
42936 }
42937 case X86ISD::PACKSS:
42938 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42939 // sign bit then we can just ask for the source operands' sign bits.
42940 // TODO - add known bits handling.
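// e.g. the sign bit of each packss result element equals the sign bit of the
// corresponding (wider) source element, since signed saturation preserves the
// sign.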
42941 if (OriginalDemandedBits.isSignMask()) {
42942 APInt DemandedLHS, DemandedRHS;
42943 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42944
42945 KnownBits KnownLHS, KnownRHS;
42946 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42947 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42948 KnownLHS, TLO, Depth + 1))
42949 return true;
42950 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42951 KnownRHS, TLO, Depth + 1))
42952 return true;
42953
42954 // Attempt to avoid multi-use ops if we don't need anything from them.
42955 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42956 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42957 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42958 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42959 if (DemandedOp0 || DemandedOp1) {
42960 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42961 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42962 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42963 }
42964 }
42965 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42966 break;
42967 case X86ISD::VBROADCAST: {
42968 SDValue Src = Op.getOperand(0);
42969 MVT SrcVT = Src.getSimpleValueType();
42970 APInt DemandedElts = APInt::getOneBitSet(
42971 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42972 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42973 TLO, Depth + 1))
42974 return true;
42975 // If we don't need the upper bits, attempt to narrow the broadcast source.
42976 // Don't attempt this on AVX512 as it might affect broadcast folding.
42977 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
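// e.g. (v2i64 broadcast (i64 X)) -> (bitcast (v4i32 broadcast (i32 (trunc X))))
// when only the low 32 bits of each 64-bit element are demanded.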
42978 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42979 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42980 Src->hasOneUse()) {
42981 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42982 SDValue NewSrc =
42983 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42984 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42985 SDValue NewBcst =
42986 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42987 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42988 }
42989 break;
42990 }
42991 case X86ISD::PCMPGT:
42992 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42993 // iff we only need the sign bit then we can use R directly.
42994 if (OriginalDemandedBits.isSignMask() &&
42995 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42996 return TLO.CombineTo(Op, Op.getOperand(1));
42997 break;
42998 case X86ISD::MOVMSK: {
42999 SDValue Src = Op.getOperand(0);
43000 MVT SrcVT = Src.getSimpleValueType();
43001 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43002 unsigned NumElts = SrcVT.getVectorNumElements();
43003
43004 // If we don't need the sign bits at all just return zero.
43005 if (OriginalDemandedBits.countr_zero() >= NumElts)
43006 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43007
43008 // See if we only demand bits from the lower 128-bit vector.
43009 if (SrcVT.is256BitVector() &&
43010 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43011 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43012 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43013 }
43014
43015 // Only demand the vector elements of the sign bits we need.
43016 APInt KnownUndef, KnownZero;
43017 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43018 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43019 TLO, Depth + 1))
43020 return true;
43021
43022 Known.Zero = KnownZero.zext(BitWidth);
43023 Known.Zero.setHighBits(BitWidth - NumElts);
43024
43025 // MOVMSK only uses the MSB from each vector element.
43026 KnownBits KnownSrc;
43027 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43028 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43029 Depth + 1))
43030 return true;
43031
43032 if (KnownSrc.One[SrcBits - 1])
43033 Known.One.setLowBits(NumElts);
43034 else if (KnownSrc.Zero[SrcBits - 1])
43035 Known.Zero.setLowBits(NumElts);
43036
43037 // Attempt to avoid multi-use ops if we don't need anything from it.
43038 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43039 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43040 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43041 return false;
43042 }
43043 case X86ISD::TESTP: {
43044 SDValue Op0 = Op.getOperand(0);
43045 SDValue Op1 = Op.getOperand(1);
43046 MVT OpVT = Op0.getSimpleValueType();
43047 assert((OpVT.getVectorElementType() == MVT::f32 ||
43048 OpVT.getVectorElementType() == MVT::f64) &&
43049 "Illegal vector type for X86ISD::TESTP");
43050
43051 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
43052 KnownBits KnownSrc;
43053 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
43054 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
43055 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
43056 AssumeSingleUse) ||
43057 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
43058 AssumeSingleUse);
43059 }
43060 case X86ISD::CMOV: {
43061 KnownBits Known2;
43062 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
43063 OriginalDemandedElts, Known2, TLO, Depth + 1))
43064 return true;
43065 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
43066 OriginalDemandedElts, Known, TLO, Depth + 1))
43067 return true;
43068
43069 // Only known if known in both the LHS and RHS.
43070 Known = Known.intersectWith(Known2);
43071 break;
43072 }
43073 case X86ISD::BEXTR:
43074 case X86ISD::BEXTRI: {
43075 SDValue Op0 = Op.getOperand(0);
43076 SDValue Op1 = Op.getOperand(1);
43077
43078 // Only bottom 16-bits of the control bits are required.
43079 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43080 // NOTE: SimplifyDemandedBits won't do this for constants.
43081 uint64_t Val1 = Cst1->getZExtValue();
43082 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43083 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43084 SDLoc DL(Op);
43085 return TLO.CombineTo(
43086 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43087 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43088 }
43089
43090 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43091 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43092
43093 // If the length is 0, the result is 0.
43094 if (Length == 0) {
43095 Known.setAllZero();
43096 return false;
43097 }
43098
43099 if ((Shift + Length) <= BitWidth) {
43100 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43101 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43102 return true;
43103
43104 Known = Known.extractBits(Length, Shift);
43105 Known = Known.zextOrTrunc(BitWidth);
43106 return false;
43107 }
43108 } else {
43109 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43110 KnownBits Known1;
43111 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43112 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43113 return true;
43114
43115 // If the length is 0, replace with 0.
43116 KnownBits LengthBits = Known1.extractBits(8, 8);
43117 if (LengthBits.isZero())
43118 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43119 }
43120
43121 break;
43122 }
43123 case X86ISD::PDEP: {
43124 SDValue Op0 = Op.getOperand(0);
43125 SDValue Op1 = Op.getOperand(1);
43126
43127 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43128 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43129
43130 // If the demanded bits have leading zeroes, we don't demand those from the
43131 // mask.
43132 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43133 return true;
43134
43135 // The number of possible 1s in the mask determines the number of LSBs of
43136 // operand 0 used. Undemanded bits from the mask don't matter so filter
43137 // them before counting.
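// e.g. if the (demanded) mask can have at most two set bits, PDEP can only
// deposit the low two bits of operand 0, so only those bits are demanded.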
43138 KnownBits Known2;
43139 uint64_t Count = (~Known.Zero & LoMask).popcount();
43140 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43141 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43142 return true;
43143
43144 // Zeroes are retained from the mask, but not ones.
43145 Known.One.clearAllBits();
43146 // The result will have at least as many trailing zeros as the non-mask
43147 // operand since bits can only map to the same or higher bit position.
43148 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43149 return false;
43150 }
43151 }
43152
43153 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43154 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43155}
43156
43157 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43158 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43159 SelectionDAG &DAG, unsigned Depth) const {
43160 int NumElts = DemandedElts.getBitWidth();
43161 unsigned Opc = Op.getOpcode();
43162 EVT VT = Op.getValueType();
43163
43164 switch (Opc) {
43165 case X86ISD::PINSRB:
43166 case X86ISD::PINSRW: {
43167 // If we don't demand the inserted element, return the base vector.
43168 SDValue Vec = Op.getOperand(0);
43169 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43170 MVT VecVT = Vec.getSimpleValueType();
43171 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43172 !DemandedElts[CIdx->getZExtValue()])
43173 return Vec;
43174 break;
43175 }
43176 case X86ISD::VSHLI: {
43177 // If we are only demanding sign bits then we can use the shift source
43178 // directly.
43179 SDValue Op0 = Op.getOperand(0);
43180 unsigned ShAmt = Op.getConstantOperandVal(1);
43181 unsigned BitWidth = DemandedBits.getBitWidth();
43182 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43183 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43184 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43185 return Op0;
43186 break;
43187 }
43188 case X86ISD::VSRAI:
43189 // iff we only need the sign bit then we can use the source directly.
43190 // TODO: generalize where we only demand extended signbits.
43191 if (DemandedBits.isSignMask())
43192 return Op.getOperand(0);
43193 break;
43194 case X86ISD::PCMPGT:
43195 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43196 // iff we only need the sign bit then we can use R directly.
43197 if (DemandedBits.isSignMask() &&
43198 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43199 return Op.getOperand(1);
43200 break;
43201 case X86ISD::BLENDV: {
43202 // BLENDV: Cond (MSB) ? LHS : RHS
43203 SDValue Cond = Op.getOperand(0);
43204 SDValue LHS = Op.getOperand(1);
43205 SDValue RHS = Op.getOperand(2);
43206
43207 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
43208 if (CondKnown.isNegative())
43209 return LHS;
43210 if (CondKnown.isNonNegative())
43211 return RHS;
43212 break;
43213 }
43214 case X86ISD::ANDNP: {
43215 // ANDNP = (~LHS & RHS);
43216 SDValue LHS = Op.getOperand(0);
43217 SDValue RHS = Op.getOperand(1);
43218
43219 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43220 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43221
43222 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43223 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43224 // this context, so return RHS.
43225 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43226 return RHS;
43227 break;
43228 }
43229 }
43230
43231 APInt ShuffleUndef, ShuffleZero;
43232 SmallVector<int, 16> ShuffleMask;
43233 SmallVector<SDValue, 2> ShuffleOps;
43234 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43235 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43236 // If all the demanded elts are from one operand and are inline,
43237 // then we can use the operand directly.
43238 int NumOps = ShuffleOps.size();
43239 if (ShuffleMask.size() == (unsigned)NumElts &&
43240 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43241 return VT.getSizeInBits() == V.getValueSizeInBits();
43242 })) {
43243
43244 if (DemandedElts.isSubsetOf(ShuffleUndef))
43245 return DAG.getUNDEF(VT);
43246 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43247 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43248
43249 // Bitmask that indicates which ops have only been accessed 'inline'.
43250 APInt IdentityOp = APInt::getAllOnes(NumOps);
43251 for (int i = 0; i != NumElts; ++i) {
43252 int M = ShuffleMask[i];
43253 if (!DemandedElts[i] || ShuffleUndef[i])
43254 continue;
43255 int OpIdx = M / NumElts;
43256 int EltIdx = M % NumElts;
43257 if (M < 0 || EltIdx != i) {
43258 IdentityOp.clearAllBits();
43259 break;
43260 }
43261 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43262 if (IdentityOp == 0)
43263 break;
43264 }
43265 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43266 "Multiple identity shuffles detected");
43267
43268 if (IdentityOp != 0)
43269 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43270 }
43271 }
43272
43273 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43274 Op, DemandedBits, DemandedElts, DAG, Depth);
43275}
43276
43277 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43278 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43279 bool PoisonOnly, unsigned Depth) const {
43280 unsigned NumElts = DemandedElts.getBitWidth();
43281
43282 switch (Op.getOpcode()) {
43283 case X86ISD::PSHUFD:
43284 case X86ISD::VPERMILPI: {
43285 SmallVector<int, 8> Mask;
43286 SmallVector<SDValue, 2> Ops;
43287 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
43288 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
43289 APInt::getZero(NumElts));
43290 for (auto M : enumerate(Mask)) {
43291 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
43292 continue;
43293 if (M.value() == SM_SentinelUndef)
43294 return false;
43295 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
43296 "Shuffle mask index out of range");
43297 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
43298 }
43299 for (auto Op : enumerate(Ops))
43300 if (!DemandedSrcElts[Op.index()].isZero() &&
43301 !DAG.isGuaranteedNotToBeUndefOrPoison(
43302 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
43303 return false;
43304 return true;
43305 }
43306 break;
43307 }
43308 }
43309 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43310 Op, DemandedElts, DAG, PoisonOnly, Depth);
43311}
43312
43313 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43314 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43315 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43316
43317 switch (Op.getOpcode()) {
43318 // SSE vector shifts handle out of bounds shift amounts.
43319 case X86ISD::VSHLI:
43320 case X86ISD::VSRLI:
43321 case X86ISD::VSRAI:
43322 return false;
43323 case X86ISD::PSHUFD:
43324 case X86ISD::VPERMILPI:
43325 case X86ISD::UNPCKH:
43326 case X86ISD::UNPCKL:
43327 return false;
43328 }
43329 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43330 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43331}
43332
43333 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43334 const APInt &DemandedElts,
43335 APInt &UndefElts,
43336 const SelectionDAG &DAG,
43337 unsigned Depth) const {
43338 unsigned NumElts = DemandedElts.getBitWidth();
43339 unsigned Opc = Op.getOpcode();
43340
43341 switch (Opc) {
43342 case X86ISD::VBROADCAST:
43343 case X86ISD::VBROADCAST_LOAD:
43344 UndefElts = APInt::getZero(NumElts);
43345 return true;
43346 }
43347
43348 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
43349 DAG, Depth);
43350}
43351
43352// Helper to peek through bitops/trunc/setcc to determine size of source vector.
43353// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
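// e.g. (v4i1 and (setcc v4i64 ...), (setcc v4i64 ...)) reports a 256-bit source.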
43354static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
43355 bool AllowTruncate) {
43356 switch (Src.getOpcode()) {
43357 case ISD::TRUNCATE:
43358 if (!AllowTruncate)
43359 return false;
43360 [[fallthrough]];
43361 case ISD::SETCC:
43362 return Src.getOperand(0).getValueSizeInBits() == Size;
43363 case ISD::FREEZE:
43364 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
43365 case ISD::AND:
43366 case ISD::XOR:
43367 case ISD::OR:
43368 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43369 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43370 case ISD::SELECT:
43371 case ISD::VSELECT:
43372 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43373 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43374 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43375 case ISD::BUILD_VECTOR:
43376 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
43377 ISD::isBuildVectorAllOnes(Src.getNode());
43378 }
43379 return false;
43380}
43381
43382// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43383static unsigned getAltBitOpcode(unsigned Opcode) {
43384 switch(Opcode) {
43385 // clang-format off
43386 case ISD::AND: return X86ISD::FAND;
43387 case ISD::OR: return X86ISD::FOR;
43388 case ISD::XOR: return X86ISD::FXOR;
43389 case X86ISD::ANDNP: return X86ISD::FANDN;
43390 // clang-format on
43391 }
43392 llvm_unreachable("Unknown bitwise opcode");
43393}
43394
43395// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43396 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43397 const SDLoc &DL) {
43398 EVT SrcVT = Src.getValueType();
43399 if (SrcVT != MVT::v4i1)
43400 return SDValue();
43401
43402 switch (Src.getOpcode()) {
43403 case ISD::SETCC:
43404 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43405 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43406 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43407 SDValue Op0 = Src.getOperand(0);
43408 if (ISD::isNormalLoad(Op0.getNode()))
43409 return DAG.getBitcast(MVT::v4f32, Op0);
43410 if (Op0.getOpcode() == ISD::BITCAST &&
43411 Op0.getOperand(0).getValueType() == MVT::v4f32)
43412 return Op0.getOperand(0);
43413 }
43414 break;
43415 case ISD::AND:
43416 case ISD::XOR:
43417 case ISD::OR: {
43418 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43419 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43420 if (Op0 && Op1)
43421 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43422 Op1);
43423 break;
43424 }
43425 }
43426 return SDValue();
43427}
43428
43429// Helper to push sign extension of vXi1 SETCC result through bitops.
43430 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43431 SDValue Src, const SDLoc &DL) {
43432 switch (Src.getOpcode()) {
43433 case ISD::SETCC:
43434 case ISD::FREEZE:
43435 case ISD::TRUNCATE:
43436 case ISD::BUILD_VECTOR:
43437 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43438 case ISD::AND:
43439 case ISD::XOR:
43440 case ISD::OR:
43441 return DAG.getNode(
43442 Src.getOpcode(), DL, SExtVT,
43443 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43444 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43445 case ISD::SELECT:
43446 case ISD::VSELECT:
43447 return DAG.getSelect(
43448 DL, SExtVT, Src.getOperand(0),
43449 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43450 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43451 }
43452 llvm_unreachable("Unexpected node type for vXi1 sign extension");
43453}
43454
43455// Try to match patterns such as
43456// (i16 bitcast (v16i1 x))
43457// ->
43458 // (i16 movmsk (v16i8 sext (v16i1 x)))
43459// before the illegal vector is scalarized on subtargets that don't have legal
43460// vxi1 types.
43461 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43462 const SDLoc &DL,
43463 const X86Subtarget &Subtarget) {
43464 EVT SrcVT = Src.getValueType();
43465 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43466 return SDValue();
43467
43468 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43469 // legalization destroys the v4i32 type.
43470 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43471 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43472 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43473 DAG.getBitcast(MVT::v4f32, V));
43474 return DAG.getZExtOrTrunc(V, DL, VT);
43475 }
43476 }
43477
43478 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
43479 // movmskb even with avx512. This will be better than truncating to vXi1 and
43480 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43481 // vpcmpeqb/vpcmpgtb.
43482 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43483 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43484 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43485 Src.getOperand(0).getValueType() == MVT::v64i8);
43486
43487 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43488 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43489 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43490 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43491 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43492 EVT CmpVT = Src.getOperand(0).getValueType();
43493 EVT EltVT = CmpVT.getVectorElementType();
43494 if (CmpVT.getSizeInBits() <= 256 &&
43495 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43496 PreferMovMsk = true;
43497 }
43498
43499 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43500 // MOVMSK is supported in SSE2 or later.
43501 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43502 return SDValue();
43503
43504 // If the upper ops of a concatenation are undef, then try to bitcast the
43505 // lower op and extend.
43506 SmallVector<SDValue, 4> SubSrcOps;
43507 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
43508 SubSrcOps.size() >= 2) {
43509 SDValue LowerOp = SubSrcOps[0];
43510 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
43511 if (LowerOp.getOpcode() == ISD::SETCC &&
43512 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
43513 EVT SubVT = VT.getIntegerVT(
43514 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
43515 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
43516 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
43517 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
43518 }
43519 }
43520 }
43521
43522 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
43523 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43524 // v8i16 and v16i16.
43525 // For these two cases, we can shuffle the upper element bytes to a
43526 // consecutive sequence at the start of the vector and treat the results as
43527 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43528 // for v16i16 this is not the case, because the shuffle is expensive, so we
43529 // avoid sign-extending to this type entirely.
43530 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43531 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43532 MVT SExtVT;
43533 bool PropagateSExt = false;
43534 switch (SrcVT.getSimpleVT().SimpleTy) {
43535 default:
43536 return SDValue();
43537 case MVT::v2i1:
43538 SExtVT = MVT::v2i64;
43539 break;
43540 case MVT::v4i1:
43541 SExtVT = MVT::v4i32;
43542 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43543 // sign-extend to a 256-bit operation to avoid truncation.
43544 if (Subtarget.hasAVX() &&
43545 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43546 SExtVT = MVT::v4i64;
43547 PropagateSExt = true;
43548 }
43549 break;
43550 case MVT::v8i1:
43551 SExtVT = MVT::v8i16;
43552 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43553 // sign-extend to a 256-bit operation to match the compare.
43554 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43555 // 256-bit because the shuffle is cheaper than sign extending the result of
43556 // the compare.
43557 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43558 checkBitcastSrcVectorSize(Src, 512, true))) {
43559 SExtVT = MVT::v8i32;
43560 PropagateSExt = true;
43561 }
43562 break;
43563 case MVT::v16i1:
43564 SExtVT = MVT::v16i8;
43565 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43566 // it is not profitable to sign-extend to 256-bit because this will
43567 // require an extra cross-lane shuffle which is more expensive than
43568 // truncating the result of the compare to 128-bits.
43569 break;
43570 case MVT::v32i1:
43571 SExtVT = MVT::v32i8;
43572 break;
43573 case MVT::v64i1:
43574 // If we have AVX512F but not AVX512BW, and the input was truncated from
43575 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
43576 if (Subtarget.hasAVX512()) {
43577 if (Subtarget.hasBWI())
43578 return SDValue();
43579 SExtVT = MVT::v64i8;
43580 break;
43581 }
43582 // Split if this is a <64 x i8> comparison result.
43583 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43584 SExtVT = MVT::v64i8;
43585 break;
43586 }
43587 return SDValue();
43588 };
43589
43590 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43591 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43592
43593 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43594 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43595 } else {
43596 if (SExtVT == MVT::v8i16) {
43597 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43598 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43599 }
43600 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43601 }
43602
43603 EVT IntVT =
43604 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43605 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43606 return DAG.getBitcast(VT, V);
43607}
43608
43609// Convert a vXi1 constant build vector to the same width scalar integer.
43610 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43611 EVT SrcVT = Op.getValueType();
43612 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43613 "Expected a vXi1 vector");
43615 "Expected a constant build vector");
43616
43617 APInt Imm(SrcVT.getVectorNumElements(), 0);
43618 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43619 SDValue In = Op.getOperand(Idx);
43620 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43621 Imm.setBit(Idx);
43622 }
43623 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43624 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43625}
43626
43627 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43628 TargetLowering::DAGCombinerInfo &DCI,
43629 const X86Subtarget &Subtarget) {
43630 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43631
43632 if (!DCI.isBeforeLegalizeOps())
43633 return SDValue();
43634
43635 // Only do this if we have k-registers.
43636 if (!Subtarget.hasAVX512())
43637 return SDValue();
43638
43639 EVT DstVT = N->getValueType(0);
43640 SDValue Op = N->getOperand(0);
43641 EVT SrcVT = Op.getValueType();
43642
43643 if (!Op.hasOneUse())
43644 return SDValue();
43645
43646 // Look for logic ops.
43647 if (Op.getOpcode() != ISD::AND &&
43648 Op.getOpcode() != ISD::OR &&
43649 Op.getOpcode() != ISD::XOR)
43650 return SDValue();
43651
43652 // Make sure we have a bitcast between mask registers and a scalar type.
43653 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43654 DstVT.isScalarInteger()) &&
43655 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43656 SrcVT.isScalarInteger()))
43657 return SDValue();
43658
43659 SDValue LHS = Op.getOperand(0);
43660 SDValue RHS = Op.getOperand(1);
43661
43662 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43663 LHS.getOperand(0).getValueType() == DstVT)
43664 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43665 DAG.getBitcast(DstVT, RHS));
43666
43667 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43668 RHS.getOperand(0).getValueType() == DstVT)
43669 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43670 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43671
43672 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43673 // Most of these have to move a constant from the scalar domain anyway.
43674 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43675 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43676 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43677 DAG.getBitcast(DstVT, LHS), RHS);
43678 }
43679
43680 return SDValue();
43681}
43682
43683 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43684 const X86Subtarget &Subtarget) {
43685 SDLoc DL(BV);
43686 unsigned NumElts = BV->getNumOperands();
43687 SDValue Splat = BV->getSplatValue();
43688
43689 // Build MMX element from integer GPR or SSE float values.
43690 auto CreateMMXElement = [&](SDValue V) {
43691 if (V.isUndef())
43692 return DAG.getUNDEF(MVT::x86mmx);
43693 if (V.getValueType().isFloatingPoint()) {
43694 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43695 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43696 V = DAG.getBitcast(MVT::v2i64, V);
43697 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43698 }
43699 V = DAG.getBitcast(MVT::i32, V);
43700 } else {
43701 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43702 }
43703 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43704 };
43705
43706 // Convert build vector ops to MMX data in the bottom elements.
43707  SmallVector<SDValue, 8> Ops;
43708
43709 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43710
43711 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43712 if (Splat) {
43713 if (Splat.isUndef())
43714 return DAG.getUNDEF(MVT::x86mmx);
43715
43716 Splat = CreateMMXElement(Splat);
43717
43718 if (Subtarget.hasSSE1()) {
43719 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43720 if (NumElts == 8)
43721 Splat = DAG.getNode(
43722 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43723 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43724 TLI.getPointerTy(DAG.getDataLayout())),
43725 Splat, Splat);
43726
43727 // Use PSHUFW to repeat 16-bit elements.
43728 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43729 return DAG.getNode(
43730 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43731 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43732 TLI.getPointerTy(DAG.getDataLayout())),
43733 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43734 }
43735 Ops.append(NumElts, Splat);
43736 } else {
43737 for (unsigned i = 0; i != NumElts; ++i)
43738 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43739 }
43740
43741 // Use tree of PUNPCKLs to build up general MMX vector.
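  // e.g. with 8 x i8 elements: PUNPCKLBW merges adjacent byte pairs,
  // PUNPCKLWD merges the resulting words, and a final PUNPCKLDQ merges the
  // two dwords.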
43742 while (Ops.size() > 1) {
43743 unsigned NumOps = Ops.size();
43744 unsigned IntrinOp =
43745 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43746 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43747 : Intrinsic::x86_mmx_punpcklbw));
43748 SDValue Intrin = DAG.getTargetConstant(
43749 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43750 for (unsigned i = 0; i != NumOps; i += 2)
43751 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43752 Ops[i], Ops[i + 1]);
43753 Ops.resize(NumOps / 2);
43754 }
43755
43756 return Ops[0];
43757}
43758
43759// Recursive function that attempts to find if a bool vector node was originally
43760// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43761// integer. If so, replace the scalar ops with bool vector equivalents back down
43762// the chain.
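// For example, (v8i1 bitcast (i8 trunc (i16 bitcast (v16i1 X)))) can be
// rebuilt as (v8i1 extract_subvector (v16i1 X), 0), staying in the mask
// domain throughout.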
43763static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43764                                          SelectionDAG &DAG,
43765                                          const X86Subtarget &Subtarget,
43766                                          unsigned Depth = 0) {
43767  if (Depth >= SelectionDAG::MaxRecursionDepth)
43768    return SDValue(); // Limit search depth.
43769
43770 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43771 unsigned Opc = V.getOpcode();
43772 switch (Opc) {
43773 case ISD::BITCAST: {
43774 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43775 SDValue Src = V.getOperand(0);
43776 EVT SrcVT = Src.getValueType();
43777 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43778 return DAG.getBitcast(VT, Src);
43779 break;
43780 }
43781 case ISD::Constant: {
43782 auto *C = cast<ConstantSDNode>(V);
43783 if (C->isZero())
43784 return DAG.getConstant(0, DL, VT);
43785 if (C->isAllOnes())
43786 return DAG.getAllOnesConstant(DL, VT);
43787 break;
43788 }
43789 case ISD::TRUNCATE: {
43790 // If we find a suitable source, a truncated scalar becomes a subvector.
43791 SDValue Src = V.getOperand(0);
43792 EVT NewSrcVT =
43793 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43794 if (TLI.isTypeLegal(NewSrcVT))
43795 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
43796 Subtarget, Depth + 1))
43797 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43798 DAG.getIntPtrConstant(0, DL));
43799 break;
43800 }
43801 case ISD::ANY_EXTEND:
43802 case ISD::ZERO_EXTEND: {
43803 // If we find a suitable source, an extended scalar becomes a subvector.
43804 SDValue Src = V.getOperand(0);
43805 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43806 Src.getScalarValueSizeInBits());
43807 if (TLI.isTypeLegal(NewSrcVT))
43808 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
43809 Subtarget, Depth + 1))
43810 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43811 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43812 : DAG.getConstant(0, DL, VT),
43813 N0, DAG.getIntPtrConstant(0, DL));
43814 break;
43815 }
43816 case ISD::OR:
43817 case ISD::XOR: {
43818 // If we find suitable sources, we can just move the op to the vector
43819 // domain.
43820 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
43821 Subtarget, Depth + 1))
43822 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
43823 Subtarget, Depth + 1))
43824 return DAG.getNode(Opc, DL, VT, N0, N1);
43825 break;
43826 }
43827 case ISD::SHL: {
43828 // If we find a suitable source, a SHL becomes a KSHIFTL.
43829 SDValue Src0 = V.getOperand(0);
43830 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43831 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43832 break;
43833
43834 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43835 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
43836 Depth + 1))
43837 return DAG.getNode(
43838 X86ISD::KSHIFTL, DL, VT, N0,
43839 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43840 break;
43841 }
43842 }
43843
43844 // Does the inner bitcast already exist?
43845 if (Depth > 0)
43846 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
43847 return SDValue(Alt, 0);
43848
43849 return SDValue();
43850}
43851
43852static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43853                              TargetLowering::DAGCombinerInfo &DCI,
43854                              const X86Subtarget &Subtarget) {
43855 SDValue N0 = N->getOperand(0);
43856 EVT VT = N->getValueType(0);
43857 EVT SrcVT = N0.getValueType();
43858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43859
43860 // Try to match patterns such as
43861 // (i16 bitcast (v16i1 x))
43862 // ->
43863  // (i16 movmsk (v16i8 sext (v16i1 x)))
43864 // before the setcc result is scalarized on subtargets that don't have legal
43865 // vxi1 types.
43866 if (DCI.isBeforeLegalize()) {
43867 SDLoc dl(N);
43868 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43869 return V;
43870
43871 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43872 // type, widen both sides to avoid a trip through memory.
43873 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43874 Subtarget.hasAVX512()) {
43875 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43876 N0 = DAG.getBitcast(MVT::v8i1, N0);
43877 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43878 DAG.getIntPtrConstant(0, dl));
43879 }
43880
43881 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43882 // type, widen both sides to avoid a trip through memory.
43883 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43884 Subtarget.hasAVX512()) {
43885 // Use zeros for the widening if we already have some zeroes. This can
43886 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43887 // stream of this.
43888 // FIXME: It might make sense to detect a concat_vectors with a mix of
43889 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43890 // a separate combine. What we can't do is canonicalize the operands of
43891 // such a concat or we'll get into a loop with SimplifyDemandedBits.
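      // e.g. (i4 bitcast (v4i1 concat (v2i1 X, v2i1 zeroes))) is widened to a
      // v8i1 concat padded with zero vectors, bitcast to i8 and truncated back
      // to i4.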
43892 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43893 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43894 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43895 SrcVT = LastOp.getValueType();
43896 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43897 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43898 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43899 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43900 N0 = DAG.getBitcast(MVT::i8, N0);
43901 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43902 }
43903 }
43904
43905 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43906 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43907 Ops[0] = N0;
43908 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43909 N0 = DAG.getBitcast(MVT::i8, N0);
43910 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43911 }
43912 } else {
43913 // If we're bitcasting from iX to vXi1, see if the integer originally
43914 // began as a vXi1 and whether we can remove the bitcast entirely.
43915 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43916 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43917 if (SDValue V =
43918 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43919 return V;
43920 }
43921 }
43922
43923 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43924 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43925 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43926 // we can help with known bits propagation from the vXi1 domain to the
43927 // scalar domain.
43928 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43929 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43930 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43931      isNullConstant(N0.getOperand(1)))
43932    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43933 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43934
43935 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43936 // and the vbroadcast_load are both integer or both fp. In some cases this
43937 // will remove the bitcast entirely.
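  // e.g. (v4f32 bitcast (v2i64 vbroadcast_load p)) is rebuilt as a v2f64
  // vbroadcast_load of an f64, leaving only an FP-to-FP bitcast that later
  // combines may remove.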
43938 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43939 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43940 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43941 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43942 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43943    // Don't swap i8/i16 since we don't have fp types of that size.
43944 if (MemSize >= 32) {
43945 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43946 : MVT::getIntegerVT(MemSize);
43947 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43948 : MVT::getIntegerVT(SrcVTSize);
43949 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43950
43951 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43952 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43953 SDValue ResNode =
43954        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43955                                MemVT, BCast->getMemOperand());
43956 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43957 return DAG.getBitcast(VT, ResNode);
43958 }
43959 }
43960
43961 // Since MMX types are special and don't usually play with other vector types,
43962 // it's better to handle them early to be sure we emit efficient code by
43963 // avoiding store-load conversions.
43964 if (VT == MVT::x86mmx) {
43965 // Detect MMX constant vectors.
43966 APInt UndefElts;
43967 SmallVector<APInt, 1> EltBits;
43968 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43969 /*AllowWholeUndefs*/ true,
43970 /*AllowPartialUndefs*/ true)) {
43971 SDLoc DL(N0);
43972 // Handle zero-extension of i32 with MOVD.
43973 if (EltBits[0].countl_zero() >= 32)
43974 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43975 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43976 // Else, bitcast to a double.
43977 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43978 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43979 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43980 }
43981
43982 // Detect bitcasts to x86mmx low word.
43983 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43984 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43985 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43986 bool LowUndef = true, AllUndefOrZero = true;
43987 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43988 SDValue Op = N0.getOperand(i);
43989 LowUndef &= Op.isUndef() || (i >= e/2);
43990 AllUndefOrZero &= isNullConstantOrUndef(Op);
43991 }
43992 if (AllUndefOrZero) {
43993 SDValue N00 = N0.getOperand(0);
43994 SDLoc dl(N00);
43995 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43996 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43997 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43998 }
43999 }
44000
44001 // Detect bitcasts of 64-bit build vectors and convert to a
44002 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44003 // lowest element.
44004 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44005 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44006 SrcVT == MVT::v8i8))
44007 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44008
44009 // Detect bitcasts between element or subvector extraction to x86mmx.
44010 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44011         N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44012        isNullConstant(N0.getOperand(1))) {
44013 SDValue N00 = N0.getOperand(0);
44014 if (N00.getValueType().is128BitVector())
44015 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44016 DAG.getBitcast(MVT::v2i64, N00));
44017 }
44018
44019 // Detect bitcasts from FP_TO_SINT to x86mmx.
44020 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44021 SDLoc DL(N0);
44022 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44023 DAG.getUNDEF(MVT::v2i32));
44024 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44025 DAG.getBitcast(MVT::v2i64, Res));
44026 }
44027 }
44028
44029 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44030 // most of these to scalar anyway.
44031 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44032 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44033      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44034    return combinevXi1ConstantToInteger(N0, DAG);
44035 }
44036
44037 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
44038 VT.getVectorElementType() == MVT::i1) {
44039 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
44040 if (C->isAllOnes())
44041 return DAG.getConstant(1, SDLoc(N0), VT);
44042 if (C->isZero())
44043 return DAG.getConstant(0, SDLoc(N0), VT);
44044 }
44045 }
44046
44047 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44048 // Turn it into a sign bit compare that produces a k-register. This avoids
44049 // a trip through a GPR.
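  // e.g. (v4i1 bitcast (i4 trunc (i32 movmsk (v4f32 X)))) becomes
  // (v4i1 setcc (v4i32 bitcast X), zero, setlt), keeping the mask in a
  // k-register.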
44050 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44051 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44052      isPowerOf2_32(VT.getVectorNumElements())) {
44053    unsigned NumElts = VT.getVectorNumElements();
44054 SDValue Src = N0;
44055
44056 // Peek through truncate.
44057 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44058 Src = N0.getOperand(0);
44059
44060 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44061 SDValue MovmskIn = Src.getOperand(0);
44062 MVT MovmskVT = MovmskIn.getSimpleValueType();
44063 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44064
44065 // We allow extra bits of the movmsk to be used since they are known zero.
44066 // We can't convert a VPMOVMSKB without avx512bw.
44067 if (MovMskElts <= NumElts &&
44068 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44069 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44070 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44071 SDLoc dl(N);
44072 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44073 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44074 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44075 if (EVT(CmpVT) == VT)
44076 return Cmp;
44077
44078 // Pad with zeroes up to original VT to replace the zeroes that were
44079 // being used from the MOVMSK.
44080 unsigned NumConcats = NumElts / MovMskElts;
44081 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44082 Ops[0] = Cmp;
44083 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44084 }
44085 }
44086 }
44087
44088 // Try to remove bitcasts from input and output of mask arithmetic to
44089 // remove GPR<->K-register crossings.
44090 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44091 return V;
44092
44093 // Convert a bitcasted integer logic operation that has one bitcasted
44094 // floating-point operand into a floating-point logic operation. This may
44095 // create a load of a constant, but that is cheaper than materializing the
44096 // constant in an integer register and transferring it to an SSE register or
44097  // transferring the SSE operand to an integer register and back.
44098 unsigned FPOpcode;
44099 switch (N0.getOpcode()) {
44100 // clang-format off
44101 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44102 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44103 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44104 default: return SDValue();
44105 // clang-format on
44106 }
44107
44108 // Check if we have a bitcast from another integer type as well.
44109 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44110 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44111 (Subtarget.hasFP16() && VT == MVT::f16) ||
44112 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44113 TLI.isTypeLegal(VT))))
44114 return SDValue();
44115
44116 SDValue LogicOp0 = N0.getOperand(0);
44117 SDValue LogicOp1 = N0.getOperand(1);
44118 SDLoc DL0(N0);
44119
44120 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44121 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44122 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44123 LogicOp0.getOperand(0).getValueType() == VT &&
44124 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44125 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44126 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44127 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44128 }
44129 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44130 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44131 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44132 LogicOp1.getOperand(0).getValueType() == VT &&
44133 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44134 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44135 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44136 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44137 }
44138
44139 return SDValue();
44140}
44141
44142// (mul (zext a), (sext b))
44143static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44144 SDValue &Op1) {
44145 Op0 = Mul.getOperand(0);
44146 Op1 = Mul.getOperand(1);
44147
44148  // Operand 1 should be a sign extend.
44149 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44150 std::swap(Op0, Op1);
44151
44152 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44153 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44154 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44155 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44156 return true;
44157
44158 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44159 return (BV && BV->isConstant());
44160 };
44161
44162  // (dpbusd (zext a), (sext b)). Since the first operand should be an
44163  // unsigned value, we need to check that Op0 is zero extended. Op1 should
44164  // be a signed value, so we just check its significant bits.
44165 if ((IsFreeTruncation(Op0) &&
44166 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44167 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44168 return true;
44169
44170 return false;
44171}
44172
44173// Given a ABS node, detect the following pattern:
44174// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44175// This is useful as it is the input into a SAD pattern.
44176static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44177 SDValue AbsOp1 = Abs->getOperand(0);
44178 if (AbsOp1.getOpcode() != ISD::SUB)
44179 return false;
44180
44181 Op0 = AbsOp1.getOperand(0);
44182 Op1 = AbsOp1.getOperand(1);
44183
44184 // Check if the operands of the sub are zero-extended from vectors of i8.
44185 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44186 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44187 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44188 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44189 return false;
44190
44191 return true;
44192}
44193
44194static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44195                              unsigned &LogBias, const SDLoc &DL,
44196 const X86Subtarget &Subtarget) {
44197 // Extend or truncate to MVT::i8 first.
44198 MVT Vi8VT =
44199 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44200 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44201 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44202
44203 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
44204  // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44205  // The src A, B element type is i8, but the dst C element type is i32.
44206  // When computing the number of reduction stages we use the vXi8 src type,
44207  // so we need a log-bias of 2 to skip the 2 stages VPDPBUSD already does.
44208 LogBias = 2;
44209
44210 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44211 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44212 RegSize = std::max(512u, RegSize);
44213
44214 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44215 // fill in the missing vector elements with 0.
44216 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44217 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44218 Ops[0] = LHS;
44219 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44220 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44221 Ops[0] = RHS;
44222 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44223
44224 // Actually build the DotProduct, split as 256/512 bits for
44225 // AVXVNNI/AVX512VNNI.
44226 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44227 ArrayRef<SDValue> Ops) {
44228 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44229 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44230 };
44231 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44232 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44233
44234 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44235 DpBuilder, false);
44236}
44237
44238// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44239// to these zexts.
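// PSADBW sums the absolute differences of eight consecutive i8 elements into
// each i64 lane, which is why the i8 sources of the zexts can be used
// directly.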
44240static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44241 const SDValue &Zext1, const SDLoc &DL,
44242 const X86Subtarget &Subtarget) {
44243 // Find the appropriate width for the PSADBW.
44244 EVT InVT = Zext0.getOperand(0).getValueType();
44245 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44246
44247 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44248 // fill in the missing vector elements with 0.
44249 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44250 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44251 Ops[0] = Zext0.getOperand(0);
44252 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44253 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44254 Ops[0] = Zext1.getOperand(0);
44255 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44256
44257 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44258 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44259 ArrayRef<SDValue> Ops) {
44260 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44261 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44262 };
44263 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44264 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44265 PSADBWBuilder);
44266}
44267
44268// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44269// PHMINPOSUW.
44270static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44271                                      const X86Subtarget &Subtarget) {
44272 // Bail without SSE41.
44273 if (!Subtarget.hasSSE41())
44274 return SDValue();
44275
44276 EVT ExtractVT = Extract->getValueType(0);
44277 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44278 return SDValue();
44279
44280 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44281 ISD::NodeType BinOp;
44282 SDValue Src = DAG.matchBinOpReduction(
44283 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44284 if (!Src)
44285 return SDValue();
44286
44287 EVT SrcVT = Src.getValueType();
44288 EVT SrcSVT = SrcVT.getScalarType();
44289 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44290 return SDValue();
44291
44292 SDLoc DL(Extract);
44293 SDValue MinPos = Src;
44294
44295 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44296 while (SrcVT.getSizeInBits() > 128) {
44297 SDValue Lo, Hi;
44298 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44299 SrcVT = Lo.getValueType();
44300 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44301 }
44302 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44303 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44304 "Unexpected value type");
44305
44306 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
44307 // to flip the value accordingly.
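  // e.g. for SMAX on v8i16 we XOR with 0x7FFF so that the largest signed
  // value maps to the smallest unsigned value; XORing with the same mask
  // after the PHMINPOSUW restores the original value.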
44308 SDValue Mask;
44309 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44310 if (BinOp == ISD::SMAX)
44311 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44312 else if (BinOp == ISD::SMIN)
44313 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44314 else if (BinOp == ISD::UMAX)
44315 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44316
44317 if (Mask)
44318 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44319
44320 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44321  // shuffling each upper element down and inserting zeros. This means that the
44322 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44323 // ready for the PHMINPOS.
44324 if (ExtractVT == MVT::i8) {
44325    SDValue Upper = DAG.getVectorShuffle(
44326        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44327 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44328 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44329 }
44330
44331  // Perform the PHMINPOS on a v8i16 vector.
44332 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44333 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44334 MinPos = DAG.getBitcast(SrcVT, MinPos);
44335
44336 if (Mask)
44337 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44338
44339 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44340 DAG.getIntPtrConstant(0, DL));
44341}
44342
44343// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44344static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44345                                         const X86Subtarget &Subtarget) {
44346 // Bail without SSE2.
44347 if (!Subtarget.hasSSE2())
44348 return SDValue();
44349
44350 EVT ExtractVT = Extract->getValueType(0);
44351 unsigned BitWidth = ExtractVT.getSizeInBits();
44352 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44353 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44354 return SDValue();
44355
44356 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44357 ISD::NodeType BinOp;
44358 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44359 if (!Match && ExtractVT == MVT::i1)
44360 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44361 if (!Match)
44362 return SDValue();
44363
44364 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44365 // which we can't support here for now.
44366 if (Match.getScalarValueSizeInBits() != BitWidth)
44367 return SDValue();
44368
44369 SDValue Movmsk;
44370 SDLoc DL(Extract);
44371 EVT MatchVT = Match.getValueType();
44372 unsigned NumElts = MatchVT.getVectorNumElements();
44373 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
44374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44375 LLVMContext &Ctx = *DAG.getContext();
44376
44377 if (ExtractVT == MVT::i1) {
44378 // Special case for (pre-legalization) vXi1 reductions.
44379 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44380 return SDValue();
44381 if (Match.getOpcode() == ISD::SETCC) {
44382 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
44383 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
44384 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
44385 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
44386 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
44387 X86::CondCode X86CC;
44388 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
44389 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
44390 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
44391 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
44392 DAG, X86CC))
44393 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
44394 getSETCC(X86CC, V, DL, DAG));
44395 }
44396 }
44397 if (TLI.isTypeLegal(MatchVT)) {
44398 // If this is a legal AVX512 predicate type then we can just bitcast.
44399 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44400 Movmsk = DAG.getBitcast(MovmskVT, Match);
44401 } else {
44402 // Use combineBitcastvxi1 to create the MOVMSK.
44403 while (NumElts > MaxElts) {
44404 SDValue Lo, Hi;
44405 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44406 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44407 NumElts /= 2;
44408 }
44409 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44410 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44411 }
44412 if (!Movmsk)
44413 return SDValue();
44414 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44415 } else {
44416 // FIXME: Better handling of k-registers or 512-bit vectors?
44417 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44418 if (!(MatchSizeInBits == 128 ||
44419 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44420 return SDValue();
44421
44422 // Make sure this isn't a vector of 1 element. The perf win from using
44423  // MOVMSK diminishes with fewer elements in the reduction, but it is
44424 // generally better to get the comparison over to the GPRs as soon as
44425 // possible to reduce the number of vector ops.
44426 if (Match.getValueType().getVectorNumElements() < 2)
44427 return SDValue();
44428
44429 // Check that we are extracting a reduction of all sign bits.
44430 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44431 return SDValue();
44432
44433 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44434 SDValue Lo, Hi;
44435 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44436 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44437 MatchSizeInBits = Match.getValueSizeInBits();
44438 }
44439
44440 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44441 MVT MaskSrcVT;
44442 if (64 == BitWidth || 32 == BitWidth)
44443      MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44444                                   MatchSizeInBits / BitWidth);
44445 else
44446 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44447
44448 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44449 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44450 NumElts = MaskSrcVT.getVectorNumElements();
44451 }
44452 assert((NumElts <= 32 || NumElts == 64) &&
44453 "Not expecting more than 64 elements");
44454
44455 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44456 if (BinOp == ISD::XOR) {
44457 // parity -> (PARITY(MOVMSK X))
44458 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44459 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44460 }
44461
44462 SDValue CmpC;
44463 ISD::CondCode CondCode;
44464 if (BinOp == ISD::OR) {
44465 // any_of -> MOVMSK != 0
44466 CmpC = DAG.getConstant(0, DL, CmpVT);
44467 CondCode = ISD::CondCode::SETNE;
44468 } else {
44469 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44470 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44471 DL, CmpVT);
44472 CondCode = ISD::CondCode::SETEQ;
44473 }
44474
44475 // The setcc produces an i8 of 0/1, so extend that to the result width and
44476 // negate to get the final 0/-1 mask value.
44477 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
44478 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44479 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44480 return DAG.getNegative(Zext, DL, ExtractVT);
44481}
44482
44483static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44484                                      const X86Subtarget &Subtarget) {
44485 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44486 return SDValue();
44487
44488 EVT ExtractVT = Extract->getValueType(0);
44489 // Verify the type we're extracting is i32, as the output element type of
44490 // vpdpbusd is i32.
44491 if (ExtractVT != MVT::i32)
44492 return SDValue();
44493
44494 EVT VT = Extract->getOperand(0).getValueType();
44495  if (!isPowerOf2_32(VT.getVectorNumElements()))
44496    return SDValue();
44497
44498 // Match shuffle + add pyramid.
44499 ISD::NodeType BinOp;
44500 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44501
44502 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44503 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
44504 // before adding into the accumulator.
44505 // TODO:
44506 // We also need to verify that the multiply has at least 2x the number of bits
44507 // of the input. We shouldn't match
44508  // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
44509 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44510 // Root = Root.getOperand(0);
44511
44512 // If there was a match, we want Root to be a mul.
44513 if (!Root || Root.getOpcode() != ISD::MUL)
44514 return SDValue();
44515
44516 // Check whether we have an extend and mul pattern
44517 SDValue LHS, RHS;
44518 if (!detectExtMul(DAG, Root, LHS, RHS))
44519 return SDValue();
44520
44521 // Create the dot product instruction.
44522 SDLoc DL(Extract);
44523 unsigned StageBias;
44524 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44525
44526 // If the original vector was wider than 4 elements, sum over the results
44527 // in the DP vector.
44528 unsigned Stages = Log2_32(VT.getVectorNumElements());
44529 EVT DpVT = DP.getValueType();
44530
44531 if (Stages > StageBias) {
44532 unsigned DpElems = DpVT.getVectorNumElements();
44533
44534 for (unsigned i = Stages - StageBias; i > 0; --i) {
44535 SmallVector<int, 16> Mask(DpElems, -1);
44536 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44537 Mask[j] = MaskEnd + j;
44538
44539 SDValue Shuffle =
44540 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44541 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44542 }
44543 }
44544
44545 // Return the lowest ExtractSizeInBits bits.
44546 EVT ResVT =
44547 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44548 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44549 DP = DAG.getBitcast(ResVT, DP);
44550 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44551 Extract->getOperand(1));
44552}
44553
44554static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44555                                      const X86Subtarget &Subtarget) {
44556 // PSADBW is only supported on SSE2 and up.
44557 if (!Subtarget.hasSSE2())
44558 return SDValue();
44559
44560 EVT ExtractVT = Extract->getValueType(0);
44561 // Verify the type we're extracting is either i32 or i64.
44562 // FIXME: Could support other types, but this is what we have coverage for.
44563 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44564 return SDValue();
44565
44566 EVT VT = Extract->getOperand(0).getValueType();
44567  if (!isPowerOf2_32(VT.getVectorNumElements()))
44568    return SDValue();
44569
44570 // Match shuffle + add pyramid.
44571 ISD::NodeType BinOp;
44572 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44573
44574 // The operand is expected to be zero extended from i8
44575 // (verified in detectZextAbsDiff).
44576  // In order to convert to i64 and above, an additional any/zero/sign
44577  // extend is expected.
44578  // The zero extend from 32 bits has no mathematical effect on the result.
44579  // The sign extend is also effectively a zero extend
44580  // (it extends the sign bit, which is zero).
44581 // So it is correct to skip the sign/zero extend instruction.
44582 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44583 Root.getOpcode() == ISD::ZERO_EXTEND ||
44584 Root.getOpcode() == ISD::ANY_EXTEND))
44585 Root = Root.getOperand(0);
44586
44587 // If there was a match, we want Root to be a select that is the root of an
44588 // abs-diff pattern.
44589 if (!Root || Root.getOpcode() != ISD::ABS)
44590 return SDValue();
44591
44592 // Check whether we have an abs-diff pattern feeding into the select.
44593 SDValue Zext0, Zext1;
44594 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44595 return SDValue();
44596
44597 // Create the SAD instruction.
44598 SDLoc DL(Extract);
44599 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44600
44601 // If the original vector was wider than 8 elements, sum over the results
44602 // in the SAD vector.
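  // e.g. a v64i8 source yields a v8i64 SAD result (Stages == 6), so three
  // shuffle+add steps are needed to fold the partial sums down to element 0.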
44603 unsigned Stages = Log2_32(VT.getVectorNumElements());
44604 EVT SadVT = SAD.getValueType();
44605 if (Stages > 3) {
44606 unsigned SadElems = SadVT.getVectorNumElements();
44607
44608 for(unsigned i = Stages - 3; i > 0; --i) {
44609 SmallVector<int, 16> Mask(SadElems, -1);
44610 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44611 Mask[j] = MaskEnd + j;
44612
44613 SDValue Shuffle =
44614 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44615 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44616 }
44617 }
44618
44619 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44620 // Return the lowest ExtractSizeInBits bits.
44621 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44622 SadVT.getSizeInBits() / ExtractSizeInBits);
44623 SAD = DAG.getBitcast(ResVT, SAD);
44624 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44625 Extract->getOperand(1));
44626}
44627
44628// If this extract is from a loaded vector value and will be used as an
44629// integer, that requires a potentially expensive XMM -> GPR transfer.
44630// Additionally, if we can convert to a scalar integer load, that will likely
44631// be folded into a subsequent integer op.
44632// Note: SrcVec might not have a VecVT type, but it must be the same size.
44633// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44634// to a single-use of the loaded vector. For the reasons above, we
44635// expect this to be profitable even if it creates an extra load.
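// e.g. (i32 (extract_elt (v4i32 load p), 2)) becomes an i32 load from p+8
// when the conditions below hold.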
44636static SDValue
44637combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44638                             const SDLoc &dl, SelectionDAG &DAG,
44639                             TargetLowering::DAGCombinerInfo &DCI) {
44640 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44641 "Only EXTRACT_VECTOR_ELT supported so far");
44642
44643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44644 EVT VT = N->getValueType(0);
44645
44646 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44647 return Use->getOpcode() == ISD::STORE ||
44648 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44649 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44650 });
44651
44652 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44653 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44654 VecVT.getVectorElementType() == VT &&
44655 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44656 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44657 SDValue NewPtr = TLI.getVectorElementPointer(
44658 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44659 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44660 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44661 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44662 SDValue Load =
44663 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44664 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44665 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44666 return Load;
44667 }
44668
44669 return SDValue();
44670}
44671
44672// Attempt to peek through a target shuffle and extract the scalar from the
44673// source.
44674static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44675                                         TargetLowering::DAGCombinerInfo &DCI,
44676                                         const X86Subtarget &Subtarget) {
44677 if (DCI.isBeforeLegalizeOps())
44678 return SDValue();
44679
44680 SDLoc dl(N);
44681 SDValue Src = N->getOperand(0);
44682 SDValue Idx = N->getOperand(1);
44683
44684 EVT VT = N->getValueType(0);
44685 EVT SrcVT = Src.getValueType();
44686 EVT SrcSVT = SrcVT.getVectorElementType();
44687 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44688 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44689
44690 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44691 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44692 return SDValue();
44693
44694 const APInt &IdxC = N->getConstantOperandAPInt(1);
44695 if (IdxC.uge(NumSrcElts))
44696 return SDValue();
44697
44698 SDValue SrcBC = peekThroughBitcasts(Src);
44699
44700 // Handle extract(bitcast(broadcast(scalar_value))).
44701 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44702 SDValue SrcOp = SrcBC.getOperand(0);
44703 EVT SrcOpVT = SrcOp.getValueType();
44704 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44705 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44706 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44707 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44708 // TODO support non-zero offsets.
44709 if (Offset == 0) {
44710 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44711 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44712 return SrcOp;
44713 }
44714 }
44715 }
44716
44717 // If we're extracting a single element from a broadcast load and there are
44718 // no other users, just create a single load.
44719 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44720 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44721 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44722 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44723 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44724 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44725 MemIntr->getBasePtr(),
44726 MemIntr->getPointerInfo(),
44727 MemIntr->getOriginalAlign(),
44728 MemIntr->getMemOperand()->getFlags());
44729 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44730 return Load;
44731 }
44732 }
44733
44734 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44735 // TODO: Move to DAGCombine?
44736 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44737 SrcBC.getValueType().isInteger() &&
44738 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44739 SrcBC.getScalarValueSizeInBits() ==
44740 SrcBC.getOperand(0).getValueSizeInBits()) {
44741 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44742 if (IdxC.ult(Scale)) {
44743 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44744 SDValue Scl = SrcBC.getOperand(0);
44745 EVT SclVT = Scl.getValueType();
44746 if (Offset) {
44747 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44748 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44749 }
44750 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44751 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44752 return Scl;
44753 }
44754 }
44755
44756 // Handle extract(truncate(x)) for 0'th index.
44757 // TODO: Treat this as a faux shuffle?
44758 // TODO: When can we use this for general indices?
44759 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44760 (SrcVT.getSizeInBits() % 128) == 0) {
44761 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44762 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44763 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44764 Idx);
44765 }
44766
44767 // We can only legally extract other elements from 128-bit vectors and in
44768 // certain circumstances, depending on SSE-level.
44769 // TODO: Investigate float/double extraction if it will be just stored.
44770 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44771 unsigned Idx) {
44772 EVT VecSVT = VecVT.getScalarType();
44773 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44774 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44775 VecSVT == MVT::i64)) {
44776 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44777 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44778 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44779 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44780 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44781 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44782 Idx &= (NumEltsPerLane - 1);
44783 }
44784 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44785 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44786 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44787 DAG.getBitcast(VecVT, Vec),
44788 DAG.getIntPtrConstant(Idx, dl));
44789 }
44790 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44791 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44792 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44793 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44794 DAG.getTargetConstant(Idx, dl, MVT::i8));
44795 }
44796 return SDValue();
44797 };
44798
44799 // Resolve the target shuffle inputs and mask.
44800  SmallVector<int, 16> Mask;
44801  SmallVector<SDValue, 2> Ops;
44802  if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44803 return SDValue();
44804
44805 // Shuffle inputs must be the same size as the result.
44806 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44807 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44808 }))
44809 return SDValue();
44810
44811 // Attempt to narrow/widen the shuffle mask to the correct size.
44812 if (Mask.size() != NumSrcElts) {
44813 if ((NumSrcElts % Mask.size()) == 0) {
44814 SmallVector<int, 16> ScaledMask;
44815 int Scale = NumSrcElts / Mask.size();
44816 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44817 Mask = std::move(ScaledMask);
44818 } else if ((Mask.size() % NumSrcElts) == 0) {
44819 // Simplify Mask based on demanded element.
44820 int ExtractIdx = (int)IdxC.getZExtValue();
44821 int Scale = Mask.size() / NumSrcElts;
44822 int Lo = Scale * ExtractIdx;
44823 int Hi = Scale * (ExtractIdx + 1);
44824 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44825 if (i < Lo || Hi <= i)
44826 Mask[i] = SM_SentinelUndef;
44827
44828 SmallVector<int, 16> WidenedMask;
44829 while (Mask.size() > NumSrcElts &&
44830 canWidenShuffleElements(Mask, WidenedMask))
44831 Mask = std::move(WidenedMask);
44832 }
44833 }
44834
44835 // If narrowing/widening failed, see if we can extract+zero-extend.
44836 int ExtractIdx;
44837 EVT ExtractVT;
44838 if (Mask.size() == NumSrcElts) {
44839 ExtractIdx = Mask[IdxC.getZExtValue()];
44840 ExtractVT = SrcVT;
44841 } else {
44842 unsigned Scale = Mask.size() / NumSrcElts;
44843 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44844 return SDValue();
44845 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44846 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44847 return SDValue();
44848 ExtractIdx = Mask[ScaledIdx];
44849 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44850 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44851 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44852 "Failed to widen vector type");
44853 }
44854
44855 // If the shuffle source element is undef/zero then we can just accept it.
44856 if (ExtractIdx == SM_SentinelUndef)
44857 return DAG.getUNDEF(VT);
44858
44859 if (ExtractIdx == SM_SentinelZero)
44860 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44861 : DAG.getConstant(0, dl, VT);
44862
44863 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44864 ExtractIdx = ExtractIdx % Mask.size();
44865 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44866 return DAG.getZExtOrTrunc(V, dl, VT);
44867
44868 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44869    if (SDValue V = combineExtractFromVectorLoad(
44870            N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44871 return V;
44872
44873 return SDValue();
44874}
44875
44876/// Extracting a scalar FP value from vector element 0 is free, so extract each
44877/// operand first, then perform the math as a scalar op.
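/// e.g. (f32 (extract_elt (fadd (v4f32 X), (v4f32 Y)), 0)) becomes
/// (fadd (f32 (extract_elt X, 0)), (f32 (extract_elt Y, 0))).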
44878static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44879                                 const X86Subtarget &Subtarget) {
44880 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44881 SDValue Vec = ExtElt->getOperand(0);
44882 SDValue Index = ExtElt->getOperand(1);
44883 EVT VT = ExtElt->getValueType(0);
44884 EVT VecVT = Vec.getValueType();
44885
44886 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44887 // non-zero element because the shuffle+scalar op will be cheaper?
44888 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44889 return SDValue();
44890
44891 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44892  // extract, the condition code), so deal with those as a special case.
44893 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44894 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44895 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44896 return SDValue();
44897
44898 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44899 SDLoc DL(ExtElt);
44900 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44901 Vec.getOperand(0), Index);
44902 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44903 Vec.getOperand(1), Index);
44904 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44905 }
44906
44907 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44908 VT != MVT::f64)
44909 return SDValue();
44910
44911 // Vector FP selects don't fit the pattern of FP math ops (because the
44912 // condition has a different type and we have to change the opcode), so deal
44913 // with those here.
44914 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44915 // has i1 elements. If we loosen this we need to convert vector bool to a
44916 // scalar bool.
44917 if (Vec.getOpcode() == ISD::VSELECT &&
44918 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44919 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44920 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44921 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44922 SDLoc DL(ExtElt);
44923    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44924                               Vec.getOperand(0).getValueType().getScalarType(),
44925                               Vec.getOperand(0), Index);
44926 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44927 Vec.getOperand(1), Index);
44928 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44929 Vec.getOperand(2), Index);
44930 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44931 }
44932
44933 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44934 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44935 // missed load folding and fma+fneg combining.
44936 switch (Vec.getOpcode()) {
44937 case ISD::FMA: // Begin 3 operands
44938 case ISD::FMAD:
44939 case ISD::FADD: // Begin 2 operands
44940 case ISD::FSUB:
44941 case ISD::FMUL:
44942 case ISD::FDIV:
44943 case ISD::FREM:
44944 case ISD::FCOPYSIGN:
44945 case ISD::FMINNUM:
44946 case ISD::FMAXNUM:
44947 case ISD::FMINNUM_IEEE:
44948 case ISD::FMAXNUM_IEEE:
44949 case ISD::FMAXIMUM:
44950 case ISD::FMINIMUM:
44951 case X86ISD::FMAX:
44952 case X86ISD::FMIN:
44953 case ISD::FABS: // Begin 1 operand
44954 case ISD::FSQRT:
44955 case ISD::FRINT:
44956 case ISD::FCEIL:
44957 case ISD::FTRUNC:
44958 case ISD::FNEARBYINT:
44959 case ISD::FROUNDEVEN:
44960 case ISD::FROUND:
44961 case ISD::FFLOOR:
44962 case X86ISD::FRCP:
44963 case X86ISD::FRSQRT: {
44964 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44965 SDLoc DL(ExtElt);
44966    SmallVector<SDValue, 4> ExtOps;
44967    for (SDValue Op : Vec->ops())
44968 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44969 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44970 }
44971 default:
44972 return SDValue();
44973 }
44974 llvm_unreachable("All opcodes should return within switch");
44975}
44976
44977/// Try to convert a vector reduction sequence composed of binops and shuffles
44978/// into horizontal ops.
44979static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44980                                     const X86Subtarget &Subtarget) {
44981 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44982
44983  // We need at least SSE2 to do anything here.
44984 if (!Subtarget.hasSSE2())
44985 return SDValue();
44986
44987 ISD::NodeType Opc;
44988 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44989 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44990 if (!Rdx)
44991 return SDValue();
44992
44993 SDValue Index = ExtElt->getOperand(1);
44994  assert(isNullConstant(Index) &&
44995         "Reduction doesn't end in an extract from index 0");
44996
44997 EVT VT = ExtElt->getValueType(0);
44998 EVT VecVT = Rdx.getValueType();
44999 if (VecVT.getScalarType() != VT)
45000 return SDValue();
45001
45002 SDLoc DL(ExtElt);
45003 unsigned NumElts = VecVT.getVectorNumElements();
45004 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45005
45006 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45007 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45008 if (V.getValueType() == MVT::v4i8) {
45009 if (ZeroExtend && Subtarget.hasSSE41()) {
45010 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45011 DAG.getConstant(0, DL, MVT::v4i32),
45012 DAG.getBitcast(MVT::i32, V),
45013 DAG.getIntPtrConstant(0, DL));
45014 return DAG.getBitcast(MVT::v16i8, V);
45015 }
45016 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45017 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45018 : DAG.getUNDEF(MVT::v4i8));
45019 }
45020 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45021 DAG.getUNDEF(MVT::v8i8));
45022 };
45023
45024 // vXi8 mul reduction - promote to vXi16 mul reduction.
45025 if (Opc == ISD::MUL) {
45026 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45027 return SDValue();
45028 if (VecVT.getSizeInBits() >= 128) {
45029 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45030 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45031 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45032 Lo = DAG.getBitcast(WideVT, Lo);
45033 Hi = DAG.getBitcast(WideVT, Hi);
45034 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45035 while (Rdx.getValueSizeInBits() > 128) {
45036 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45037 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45038 }
45039 } else {
45040 Rdx = WidenToV16I8(Rdx, false);
45041 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45042 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45043 }
45044 if (NumElts >= 8)
45045 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45046 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45047 {4, 5, 6, 7, -1, -1, -1, -1}));
45048 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45049 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45050 {2, 3, -1, -1, -1, -1, -1, -1}));
45051 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45052 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45053 {1, -1, -1, -1, -1, -1, -1, -1}));
45054 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45055 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45056 }
45057
45058 // vXi8 add reduction - sub 128-bit vector.
45059 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45060 Rdx = WidenToV16I8(Rdx, true);
45061 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45062 DAG.getConstant(0, DL, MVT::v16i8));
45063 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45065 }
45066
45067 // Must be a >=128-bit vector with pow2 elements.
45068 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45069 return SDValue();
45070
45071 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45072 if (VT == MVT::i8) {
45073 while (Rdx.getValueSizeInBits() > 128) {
45074 SDValue Lo, Hi;
45075 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45076 VecVT = Lo.getValueType();
45077 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45078 }
45079 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45080
45081    SDValue Hi = DAG.getVectorShuffle(
45082        MVT::v16i8, DL, Rdx, Rdx,
45083 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45084 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45085 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45086 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45087 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45088 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45089 }
45090
45091 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45092 // If the source vector values are 0-255, then we can use PSADBW to
45093 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45094  // TODO: See if it's worth avoiding vXi16/i32 truncations?
45095 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45096 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45097 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45098 Subtarget.hasAVX512())) {
45099 if (Rdx.getValueType() == MVT::v8i16) {
45100 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
45101 DAG.getUNDEF(MVT::v8i16));
45102 } else {
45103 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45104 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45105 if (ByteVT.getSizeInBits() < 128)
45106 Rdx = WidenToV16I8(Rdx, true);
45107 }
45108
45109 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45110 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45111 ArrayRef<SDValue> Ops) {
45112 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45113 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45114 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45115 };
45116 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45117 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45118
45119 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45120 while (Rdx.getValueSizeInBits() > 128) {
45121 SDValue Lo, Hi;
45122 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45123 VecVT = Lo.getValueType();
45124 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45125 }
45126 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45127
45128 if (NumElts > 8) {
45129 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45130 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45131 }
45132
45133 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45134 Rdx = DAG.getBitcast(VecVT, Rdx);
45135 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45136 }
45137
45138 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing code size.
45139 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45140 return SDValue();
45141
45142 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45143
45144 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45145 // across the whole vector, so we need an extract + hop preliminary stage.
45146 // This is the only step where the operands of the hop are not the same value.
45147 // TODO: We could extend this to handle 512-bit or even longer vectors.
45148 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45149 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45150 unsigned NumElts = VecVT.getVectorNumElements();
45151 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45152 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45153 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45154 VecVT = Rdx.getValueType();
45155 }
45156 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45157 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45158 return SDValue();
45159
45160 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45161 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45162 for (unsigned i = 0; i != ReductionSteps; ++i)
45163 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45164
45165 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45166}
45167
45168/// Detect vector gather/scatter index generation and convert it from being a
45169/// bunch of shuffles and extracts into a somewhat faster sequence.
45170/// For i686, the best sequence is apparently storing the value and loading
45171/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45172 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45173 TargetLowering::DAGCombinerInfo &DCI,
45174 const X86Subtarget &Subtarget) {
45175 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45176 return NewOp;
45177
45178 SDValue InputVector = N->getOperand(0);
45179 SDValue EltIdx = N->getOperand(1);
45180 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45181
45182 EVT SrcVT = InputVector.getValueType();
45183 EVT VT = N->getValueType(0);
45184 SDLoc dl(InputVector);
45185 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45186 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45187 unsigned NumEltBits = VT.getScalarSizeInBits();
45188 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45189
45190 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45191 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45192
45193 // Integer Constant Folding.
45194 if (CIdx && VT.isInteger()) {
45195 APInt UndefVecElts;
45196 SmallVector<APInt, 16> EltBits;
45197 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45198 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45199 EltBits, /*AllowWholeUndefs*/ true,
45200 /*AllowPartialUndefs*/ false)) {
45201 uint64_t Idx = CIdx->getZExtValue();
45202 if (UndefVecElts[Idx])
45203 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45204 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45205 }
45206
45207 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45208 // This improves lowering of bool masks on Rust, which splits them into a byte array.
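// e.g. (i8 (extract_vector_elt (v8i8 (bitcast (v64i1 X))), 5)) becomes
// (i8 (bitcast (v8i1 (extract_subvector (v64i1 X), 40)))).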
45209 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45210 SDValue Src = peekThroughBitcasts(InputVector);
45211 if (Src.getValueType().getScalarType() == MVT::i1 &&
45212 TLI.isTypeLegal(Src.getValueType())) {
45213 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45214 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45215 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45216 return DAG.getBitcast(VT, Sub);
45217 }
45218 }
45219 }
45220
45221 if (IsPextr) {
45222 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45223 DCI))
45224 return SDValue(N, 0);
45225
45226 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45227 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45228 InputVector.getOpcode() == X86ISD::PINSRW) &&
45229 InputVector.getOperand(2) == EltIdx) {
45230 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45231 "Vector type mismatch");
45232 SDValue Scl = InputVector.getOperand(1);
45233 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45234 return DAG.getZExtOrTrunc(Scl, dl, VT);
45235 }
45236
45237 // TODO - Remove this once we can handle the implicit zero-extension of
45238 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45239 // combineBasicSADPattern.
45240 return SDValue();
45241 }
45242
45243 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
45244 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45245 InputVector.getOpcode() == ISD::BITCAST &&
45246 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45247 isNullConstant(EltIdx) && InputVector.hasOneUse())
45248 return DAG.getBitcast(VT, InputVector);
45249
45250 // Detect mmx to i32 conversion through a v2i32 elt extract.
45251 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45252 InputVector.getOpcode() == ISD::BITCAST &&
45253 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45254 isNullConstant(EltIdx) && InputVector.hasOneUse())
45255 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45256 InputVector.getOperand(0));
45257
45258 // Check whether this extract is the root of a sum of absolute differences
45259 // pattern. This has to be done here because we really want it to happen
45260 // pre-legalization.
45261 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45262 return SAD;
45263
45264 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45265 return VPDPBUSD;
45266
45267 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45268 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45269 return Cmp;
45270
45271 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45272 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45273 return MinMax;
45274
45275 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc.
45276 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45277 return V;
45278
45279 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45280 return V;
45281
45282 if (CIdx)
45283 if (SDValue V = combineExtractFromVectorLoad(
45284 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
45285 dl, DAG, DCI))
45286 return V;
45287
45288 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
45289 // and then testing the relevant element.
45290 //
45291 // Note that we only combine extracts on the *same* result number, i.e.
45292 // t0 = merge_values a0, a1, a2, a3
45293 // i1 = extract_vector_elt t0, Constant:i64<2>
45294 // i1 = extract_vector_elt t0, Constant:i64<3>
45295 // but not
45296 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45297 // since the latter would need its own MOVMSK.
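// Each qualifying use is rewritten as ((movmsk X) & (1 << Idx)) == (1 << Idx),
// so a single MOVMSK serves every i1 extract from the same source vector.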
45298 if (SrcVT.getScalarType() == MVT::i1) {
45299 bool IsVar = !CIdx;
45300 SmallVector<SDNode *, 16> BoolExtracts;
45301 unsigned ResNo = InputVector.getResNo();
45302 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45303 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45304 Use->getOperand(0).getResNo() == ResNo &&
45305 Use->getValueType(0) == MVT::i1) {
45306 BoolExtracts.push_back(Use);
45307 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45308 return true;
45309 }
45310 return false;
45311 };
45312 // TODO: Can we drop the oneuse check for constant extracts?
45313 if (all_of(InputVector->uses(), IsBoolExtract) &&
45314 (IsVar || BoolExtracts.size() > 1)) {
45315 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45316 if (SDValue BC =
45317 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45318 for (SDNode *Use : BoolExtracts) {
45319 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45320 // Mask = 1 << MaskIdx
45321 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45322 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45323 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45324 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45325 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45326 DCI.CombineTo(Use, Res);
45327 }
45328 return SDValue(N, 0);
45329 }
45330 }
45331 }
45332
45333 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
45334 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
45335 SDValue TruncSrc = InputVector.getOperand(0);
45336 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
45337 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
45338 SDValue NewExt =
45339 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
45340 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
45341 }
45342 }
45343
45344 return SDValue();
45345}
45346
45347// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45348// This is more or less the reverse of combineBitcastvxi1.
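// e.g. sext(v8i1 bitcast(i8 X)) broadcasts X to every lane, ANDs lane i with
// the single-bit mask (1 << i), compares the result for equality against that
// mask and sign-extends the compare; zero/any-extension then shifts the sign
// bit back down instead of returning the sign-splat directly.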
45349 static SDValue combineToExtendBoolVectorInReg(
45350 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45351 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45352 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45353 Opcode != ISD::ANY_EXTEND)
45354 return SDValue();
45355 if (!DCI.isBeforeLegalizeOps())
45356 return SDValue();
45357 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45358 return SDValue();
45359
45360 EVT SVT = VT.getScalarType();
45361 EVT InSVT = N0.getValueType().getScalarType();
45362 unsigned EltSizeInBits = SVT.getSizeInBits();
45363
45364 // Input type must be extending a bool vector (bit-casted from a scalar
45365 // integer) to legal integer types.
45366 if (!VT.isVector())
45367 return SDValue();
45368 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45369 return SDValue();
45370 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45371 return SDValue();
45372
45373 SDValue N00 = N0.getOperand(0);
45374 EVT SclVT = N00.getValueType();
45375 if (!SclVT.isScalarInteger())
45376 return SDValue();
45377
45378 SDValue Vec;
45379 SmallVector<int> ShuffleMask;
45380 unsigned NumElts = VT.getVectorNumElements();
45381 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45382
45383 // Broadcast the scalar integer to the vector elements.
45384 if (NumElts > EltSizeInBits) {
45385 // If the scalar integer is greater than the vector element size, then we
45386 // must split it down into sub-sections for broadcasting. For example:
45387 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45388 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45389 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45390 unsigned Scale = NumElts / EltSizeInBits;
45391 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45392 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45393 Vec = DAG.getBitcast(VT, Vec);
45394
45395 for (unsigned i = 0; i != Scale; ++i)
45396 ShuffleMask.append(EltSizeInBits, i);
45397 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45398 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45399 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45400 // If we have register broadcast instructions, use the scalar size as the
45401 // element type for the shuffle. Then cast to the wider element type. The
45402 // widened bits won't be used, and this might allow the use of a broadcast
45403 // load.
45404 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45405 unsigned Scale = EltSizeInBits / NumElts;
45406 EVT BroadcastVT =
45407 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45408 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45409 ShuffleMask.append(NumElts * Scale, 0);
45410 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45411 Vec = DAG.getBitcast(VT, Vec);
45412 } else {
45413 // For smaller scalar integers, we can simply any-extend it to the vector
45414 // element size (we don't care about the upper bits) and broadcast it to all
45415 // elements.
45416 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45417 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45418 ShuffleMask.append(NumElts, 0);
45419 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45420 }
45421
45422 // Now, mask the relevant bit in each element.
45423 SmallVector<SDValue, 32> Bits;
45424 for (unsigned i = 0; i != NumElts; ++i) {
45425 int BitIdx = (i % EltSizeInBits);
45426 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45427 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45428 }
45429 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45430 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45431
45432 // Compare against the bitmask and extend the result.
45433 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
45434 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45435 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45436
45437 // For SEXT, this is now done, otherwise shift the result down for
45438 // zero-extension.
45439 if (Opcode == ISD::SIGN_EXTEND)
45440 return Vec;
45441 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45442 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45443}
45444
45445/// If a vector select has an operand that is -1 or 0, try to simplify the
45446/// select to a bitwise logic operation.
45447/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
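/// e.g. (vselect M, -1, X) becomes (or M, X) and (vselect M, X, 0) becomes
/// (and M, X) once M is known to be an all-bits/zero (sign-splat) mask.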
45448static SDValue
45449 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
45450 TargetLowering::DAGCombinerInfo &DCI,
45451 const X86Subtarget &Subtarget) {
45452 SDValue Cond = N->getOperand(0);
45453 SDValue LHS = N->getOperand(1);
45454 SDValue RHS = N->getOperand(2);
45455 EVT VT = LHS.getValueType();
45456 EVT CondVT = Cond.getValueType();
45457 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45458
45459 if (N->getOpcode() != ISD::VSELECT)
45460 return SDValue();
45461
45462 assert(CondVT.isVector() && "Vector select expects a vector selector!");
45463
45464 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45465 // TODO: Can we assert that both operands are not zeros (because that should
45466 // get simplified at node creation time)?
45467 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45468 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45469
45470 // If both inputs are 0/undef, create a complete zero vector.
45471 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45472 if (TValIsAllZeros && FValIsAllZeros) {
45473 if (VT.isFloatingPoint())
45474 return DAG.getConstantFP(0.0, DL, VT);
45475 return DAG.getConstant(0, DL, VT);
45476 }
45477
45478 // To use the condition operand as a bitwise mask, it must have elements that
45479 // are the same size as the select elements. I.e., the condition operand must
45480 // have already been promoted from the IR select condition type <N x i1>.
45481 // Don't check if the types themselves are equal because that excludes
45482 // vector floating-point selects.
45483 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45484 return SDValue();
45485
45486 // Try to invert the condition if true value is not all 1s and false value is
45487 // not all 0s. Only do this if the condition has one use.
45488 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45489 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45490 // Check if the selector will be produced by CMPP*/PCMP*.
45491 Cond.getOpcode() == ISD::SETCC &&
45492 // Check if SETCC has already been promoted.
45493 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45494 CondVT) {
45495 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45496
45497 if (TValIsAllZeros || FValIsAllOnes) {
45498 SDValue CC = Cond.getOperand(2);
45499 ISD::CondCode NewCC = ISD::getSetCCInverse(
45500 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45501 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45502 NewCC);
45503 std::swap(LHS, RHS);
45504 TValIsAllOnes = FValIsAllOnes;
45505 FValIsAllZeros = TValIsAllZeros;
45506 }
45507 }
45508
45509 // Cond value must be 'sign splat' to be converted to a logical op.
45510 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45511 return SDValue();
45512
45513 // vselect Cond, 111..., 000... -> Cond
45514 if (TValIsAllOnes && FValIsAllZeros)
45515 return DAG.getBitcast(VT, Cond);
45516
45517 if (!TLI.isTypeLegal(CondVT))
45518 return SDValue();
45519
45520 // vselect Cond, 111..., X -> or Cond, X
45521 if (TValIsAllOnes) {
45522 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45523 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45524 return DAG.getBitcast(VT, Or);
45525 }
45526
45527 // vselect Cond, X, 000... -> and Cond, X
45528 if (FValIsAllZeros) {
45529 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45530 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45531 return DAG.getBitcast(VT, And);
45532 }
45533
45534 // vselect Cond, 000..., X -> andn Cond, X
45535 if (TValIsAllZeros) {
45536 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45537 SDValue AndN;
45538 // The canonical form differs for i1 vectors - x86andnp is not used.
45539 if (CondVT.getScalarType() == MVT::i1)
45540 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45541 CastRHS);
45542 else
45543 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45544 return DAG.getBitcast(VT, AndN);
45545 }
45546
45547 return SDValue();
45548}
45549
45550/// If both arms of a vector select are concatenated vectors, split the select,
45551/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45552/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45553/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45554 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
45555 const X86Subtarget &Subtarget) {
45556 unsigned Opcode = N->getOpcode();
45557 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45558 return SDValue();
45559
45560 // TODO: Split 512-bit vectors too?
45561 EVT VT = N->getValueType(0);
45562 if (!VT.is256BitVector())
45563 return SDValue();
45564
45565 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45566 SDValue Cond = N->getOperand(0);
45567 SDValue TVal = N->getOperand(1);
45568 SDValue FVal = N->getOperand(2);
45569 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45570 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45571 !isFreeToSplitVector(FVal.getNode(), DAG))
45572 return SDValue();
45573
45574 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45575 ArrayRef<SDValue> Ops) {
45576 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45577 };
45578 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
45579 /*CheckBWI*/ false);
45580}
45581
45582 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
45583 const SDLoc &DL) {
45584 SDValue Cond = N->getOperand(0);
45585 SDValue LHS = N->getOperand(1);
45586 SDValue RHS = N->getOperand(2);
45587
45588 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45589 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45590 if (!TrueC || !FalseC)
45591 return SDValue();
45592
45593 // Don't do this for crazy integer types.
45594 EVT VT = N->getValueType(0);
45595 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45596 return SDValue();
45597
45598 // We're going to use the condition bit in math or logic ops. We could allow
45599 // this with a wider condition value (post-legalization it becomes an i8),
45600 // but if nothing is creating selects that late, it doesn't matter.
45601 if (Cond.getValueType() != MVT::i1)
45602 return SDValue();
45603
45604 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45605 // 3, 5, or 9 with i32/i64, so those get transformed too.
45606 // TODO: For constants that overflow or do not differ by power-of-2 or small
45607 // multiplier, convert to 'and' + 'add'.
45608 const APInt &TrueVal = TrueC->getAPIntValue();
45609 const APInt &FalseVal = FalseC->getAPIntValue();
45610
45611 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45612 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45613 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45614 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45615 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45616 return SDValue();
45617 }
45618
45619 bool OV;
45620 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45621 if (OV)
45622 return SDValue();
45623
45624 APInt AbsDiff = Diff.abs();
45625 if (AbsDiff.isPowerOf2() ||
45626 ((VT == MVT::i32 || VT == MVT::i64) &&
45627 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45628
45629 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45630 // of the condition can usually be folded into a compare predicate, but even
45631 // without that, the sequence should be cheaper than a CMOV alternative.
45632 if (TrueVal.slt(FalseVal)) {
45633 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45634 std::swap(TrueC, FalseC);
45635 }
45636
45637 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
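// e.g. select Cond, 5, 2 becomes (zext(Cond) * 3) + 2, and select Cond, 9, 8
// becomes zext(Cond) + 8 because the difference is one.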
45638 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45639
45640 // Multiply condition by the difference if non-one.
45641 if (!AbsDiff.isOne())
45642 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45643
45644 // Add the base if non-zero.
45645 if (!FalseC->isZero())
45646 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45647
45648 return R;
45649 }
45650
45651 return SDValue();
45652}
45653
45654/// If this is a *dynamic* select (non-constant condition) and we can match
45655/// this node with one of the variable blend instructions, restructure the
45656/// condition so that blends can use the high (sign) bit of each element.
45657/// This function will also call SimplifyDemandedBits on already created
45658/// BLENDV to perform additional simplifications.
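/// Only the sign bit of each condition element matters to BLENDV, so the
/// condition is simplified against a sign-bit-only demanded mask and every
/// select sharing that condition is rewritten to X86ISD::BLENDV.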
45659 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45660 const SDLoc &DL,
45661 TargetLowering::DAGCombinerInfo &DCI,
45662 const X86Subtarget &Subtarget) {
45663 SDValue Cond = N->getOperand(0);
45664 if ((N->getOpcode() != ISD::VSELECT &&
45665 N->getOpcode() != X86ISD::BLENDV) ||
45666 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45667 return SDValue();
45668
45669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45670 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45671 EVT VT = N->getValueType(0);
45672
45673 // We can only handle the cases where VSELECT is directly legal on the
45674 // subtarget. We custom lower VSELECT nodes with constant conditions and
45675 // this makes it hard to see whether a dynamic VSELECT will correctly
45676 // lower, so we both check the operation's status and explicitly handle the
45677 // cases where a *dynamic* blend will fail even though a constant-condition
45678 // blend could be custom lowered.
45679 // FIXME: We should find a better way to handle this class of problems.
45680 // Potentially, we should combine constant-condition vselect nodes
45681 // pre-legalization into shuffles and not mark as many types as custom
45682 // lowered.
45683 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45684 return SDValue();
45685 // FIXME: We don't support i16-element blends currently. We could and
45686 // should support them by making *all* the bits in the condition be set
45687 // rather than just the high bit and using an i8-element blend.
45688 if (VT.getVectorElementType() == MVT::i16)
45689 return SDValue();
45690 // Dynamic blending was only available from SSE4.1 onward.
45691 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45692 return SDValue();
45693 // Byte blends are only available in AVX2.
45694 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45695 return SDValue();
45696 // There are no 512-bit blend instructions that use sign bits.
45697 if (VT.is512BitVector())
45698 return SDValue();
45699
45700 // Don't optimize before the condition has been transformed to a legal type
45701 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45702 if (BitWidth < 8 || BitWidth > 64)
45703 return SDValue();
45704
45705 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45706 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45707 UI != UE; ++UI)
45708 if ((UI->getOpcode() != ISD::VSELECT &&
45709 UI->getOpcode() != X86ISD::BLENDV) ||
45710 UI.getOperandNo() != 0)
45711 return false;
45712
45713 return true;
45714 };
45715
45716 APInt DemandedBits(APInt::getSignMask(BitWidth));
45717
45718 if (OnlyUsedAsSelectCond(Cond)) {
45719 KnownBits Known;
45720 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45721 !DCI.isBeforeLegalizeOps());
45722 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45723 return SDValue();
45724
45725 // If we changed the computation somewhere in the DAG, this change will
45726 // affect all users of Cond. Update all the nodes so that we do not use
45727 // the generic VSELECT anymore. Otherwise, we may perform wrong
45728 // optimizations as we messed with the actual expectation for the vector
45729 // boolean values.
45730 for (SDNode *U : Cond->uses()) {
45731 if (U->getOpcode() == X86ISD::BLENDV)
45732 continue;
45733
45734 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45735 Cond, U->getOperand(1), U->getOperand(2));
45736 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45737 DCI.AddToWorklist(U);
45738 }
45739 DCI.CommitTargetLoweringOpt(TLO);
45740 return SDValue(N, 0);
45741 }
45742
45743 // Otherwise we can still at least try to simplify multiple use bits.
45744 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45745 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
45746 N->getOperand(1), N->getOperand(2));
45747
45748 return SDValue();
45749}
45750
45751// Try to match:
45752// (or (and (M, (sub 0, X)), (pandn M, X)))
45753// which is a special case of:
45754// (select M, (sub 0, X), X)
45755// Per:
45756// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45757// We know that, if fNegate is 0 or 1:
45758// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45759//
45760// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45761// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45762// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45763// This lets us transform our vselect to:
45764// (add (xor X, M), (and M, 1))
45765// And further to:
45766// (sub (xor X, M), M)
45767 static SDValue combineLogicBlendIntoConditionalNegate(
45768 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45769 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45770 EVT MaskVT = Mask.getValueType();
45771 assert(MaskVT.isInteger() &&
45772 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45773 "Mask must be zero/all-bits");
45774
45775 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45776 return SDValue();
45777 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45778 return SDValue();
45779
45780 auto IsNegV = [](SDNode *N, SDValue V) {
45781 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45782 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45783 };
45784
45785 SDValue V;
45786 if (IsNegV(Y.getNode(), X))
45787 V = X;
45788 else if (IsNegV(X.getNode(), Y))
45789 V = Y;
45790 else
45791 return SDValue();
45792
45793 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45794 SDValue SubOp2 = Mask;
45795
45796 // If the negate was on the false side of the select, then
45797 // the operands of the SUB need to be swapped. PR 27251.
45798 // This is because the pattern being matched above is
45799 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45800 // but if the pattern matched was
45801 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45802 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45803 // pattern also needs to be a negation of the replacement pattern above.
45804 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45805 // sub accomplishes the negation of the replacement pattern.
45806 if (V == Y)
45807 std::swap(SubOp1, SubOp2);
45808
45809 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45810 return DAG.getBitcast(VT, Res);
45811}
45812
45813 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
45814 const X86Subtarget &Subtarget) {
45815 if (!Subtarget.hasAVX512())
45816 return SDValue();
45817 if (N->getOpcode() != ISD::VSELECT)
45818 return SDValue();
45819
45820 SDValue Cond = N->getOperand(0);
45821 SDValue LHS = N->getOperand(1);
45822 SDValue RHS = N->getOperand(2);
45823
45824 if (canCombineAsMaskOperation(LHS, Subtarget))
45825 return SDValue();
45826
45827 if (!canCombineAsMaskOperation(RHS, Subtarget))
45828 return SDValue();
45829
45830 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45831 return SDValue();
45832
45833 // Commute LHS and RHS to create opportunity to select mask instruction.
45834 // (vselect M, L, R) -> (vselect ~M, R, L)
45835 ISD::CondCode NewCC =
45836 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45837 Cond.getOperand(0).getValueType());
45838 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45839 Cond.getOperand(1), NewCC);
45840 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45841}
45842
45843/// Do target-specific dag combines on SELECT and VSELECT nodes.
45844 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45845 TargetLowering::DAGCombinerInfo &DCI,
45846 const X86Subtarget &Subtarget) {
45847 SDLoc DL(N);
45848 SDValue Cond = N->getOperand(0);
45849 SDValue LHS = N->getOperand(1);
45850 SDValue RHS = N->getOperand(2);
45851
45852 // Try simplification again because we use this function to optimize
45853 // BLENDV nodes that are not handled by the generic combiner.
45854 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45855 return V;
45856
45857 // When AVX512 is available, the LHS operand of a select instruction can be
45858 // folded into a masked instruction, while the RHS operand can't. Commute the
45859 // LHS and RHS of the select instruction to create the opportunity for
45860 // folding.
45861 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
45862 return V;
45863
45864 EVT VT = LHS.getValueType();
45865 EVT CondVT = Cond.getValueType();
45866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45867 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45868
45869 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45870 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45871 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45872 if (CondVT.isVector() && CondVT.isInteger() &&
45873 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45874 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45875 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45876 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45877 DL, DAG, Subtarget))
45878 return V;
45879
45880 // Convert vselects with constant condition into shuffles.
45881 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45882 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45883 SmallVector<int, 64> Mask;
45884 if (createShuffleMaskFromVSELECT(Mask, Cond,
45885 N->getOpcode() == X86ISD::BLENDV))
45886 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45887 }
45888
45889 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45890 // by forcing the unselected elements to zero.
45891 // TODO: Can we handle more shuffles with this?
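// Lanes not chosen by the blend get index 0x80 in the corresponding PSHUFB
// mask, which makes PSHUFB write zero there, so OR'ing the two shuffle results
// reproduces the original blend.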
45892 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45893 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45894 LHS.hasOneUse() && RHS.hasOneUse()) {
45895 MVT SimpleVT = VT.getSimpleVT();
45896 SmallVector<SDValue, 1> LHSOps, RHSOps;
45897 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45898 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45899 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45900 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45901 int NumElts = VT.getVectorNumElements();
45902 for (int i = 0; i != NumElts; ++i) {
45903 // getConstVector sets negative shuffle mask values as undef, so ensure
45904 // we hardcode SM_SentinelZero values to zero (0x80).
45905 if (CondMask[i] < NumElts) {
45906 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45907 RHSMask[i] = 0x80;
45908 } else {
45909 LHSMask[i] = 0x80;
45910 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45911 }
45912 }
45913 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45914 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45915 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45916 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45917 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45918 }
45919 }
45920
45921 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45922 // instructions match the semantics of the common C idiom x<y?x:y but not
45923 // x<=y?x:y, because of how they handle negative zero (which can be
45924 // ignored in unsafe-math mode).
45925 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45926 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45927 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45928 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45929 (Subtarget.hasSSE2() ||
45930 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45931 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45932
45933 unsigned Opcode = 0;
45934 // Check for x CC y ? x : y.
45935 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45936 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45937 switch (CC) {
45938 default: break;
45939 case ISD::SETULT:
45940 // Converting this to a min would handle NaNs incorrectly, and swapping
45941 // the operands would cause it to handle comparisons between positive
45942 // and negative zero incorrectly.
45943 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45944 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45945 !(DAG.isKnownNeverZeroFloat(LHS) ||
45946 DAG.isKnownNeverZeroFloat(RHS)))
45947 break;
45948 std::swap(LHS, RHS);
45949 }
45950 Opcode = X86ISD::FMIN;
45951 break;
45952 case ISD::SETOLE:
45953 // Converting this to a min would handle comparisons between positive
45954 // and negative zero incorrectly.
45955 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45956 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45957 break;
45958 Opcode = X86ISD::FMIN;
45959 break;
45960 case ISD::SETULE:
45961 // Converting this to a min would handle both negative zeros and NaNs
45962 // incorrectly, but we can swap the operands to fix both.
45963 std::swap(LHS, RHS);
45964 [[fallthrough]];
45965 case ISD::SETOLT:
45966 case ISD::SETLT:
45967 case ISD::SETLE:
45968 Opcode = X86ISD::FMIN;
45969 break;
45970
45971 case ISD::SETOGE:
45972 // Converting this to a max would handle comparisons between positive
45973 // and negative zero incorrectly.
45974 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45975 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45976 break;
45977 Opcode = X86ISD::FMAX;
45978 break;
45979 case ISD::SETUGT:
45980 // Converting this to a max would handle NaNs incorrectly, and swapping
45981 // the operands would cause it to handle comparisons between positive
45982 // and negative zero incorrectly.
45983 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45984 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45985 !(DAG.isKnownNeverZeroFloat(LHS) ||
45986 DAG.isKnownNeverZeroFloat(RHS)))
45987 break;
45988 std::swap(LHS, RHS);
45989 }
45990 Opcode = X86ISD::FMAX;
45991 break;
45992 case ISD::SETUGE:
45993 // Converting this to a max would handle both negative zeros and NaNs
45994 // incorrectly, but we can swap the operands to fix both.
45995 std::swap(LHS, RHS);
45996 [[fallthrough]];
45997 case ISD::SETOGT:
45998 case ISD::SETGT:
45999 case ISD::SETGE:
46000 Opcode = X86ISD::FMAX;
46001 break;
46002 }
46003 // Check for x CC y ? y : x -- a min/max with reversed arms.
46004 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46005 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46006 switch (CC) {
46007 default: break;
46008 case ISD::SETOGE:
46009 // Converting this to a min would handle comparisons between positive
46010 // and negative zero incorrectly, and swapping the operands would
46011 // cause it to handle NaNs incorrectly.
46012 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46013 !(DAG.isKnownNeverZeroFloat(LHS) ||
46014 DAG.isKnownNeverZeroFloat(RHS))) {
46015 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46016 break;
46017 std::swap(LHS, RHS);
46018 }
46019 Opcode = X86ISD::FMIN;
46020 break;
46021 case ISD::SETUGT:
46022 // Converting this to a min would handle NaNs incorrectly.
46023 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46024 break;
46025 Opcode = X86ISD::FMIN;
46026 break;
46027 case ISD::SETUGE:
46028 // Converting this to a min would handle both negative zeros and NaNs
46029 // incorrectly, but we can swap the operands to fix both.
46030 std::swap(LHS, RHS);
46031 [[fallthrough]];
46032 case ISD::SETOGT:
46033 case ISD::SETGT:
46034 case ISD::SETGE:
46035 Opcode = X86ISD::FMIN;
46036 break;
46037
46038 case ISD::SETULT:
46039 // Converting this to a max would handle NaNs incorrectly.
46040 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46041 break;
46042 Opcode = X86ISD::FMAX;
46043 break;
46044 case ISD::SETOLE:
46045 // Converting this to a max would handle comparisons between positive
46046 // and negative zero incorrectly, and swapping the operands would
46047 // cause it to handle NaNs incorrectly.
46048 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46049 !DAG.isKnownNeverZeroFloat(LHS) &&
46050 !DAG.isKnownNeverZeroFloat(RHS)) {
46051 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46052 break;
46053 std::swap(LHS, RHS);
46054 }
46055 Opcode = X86ISD::FMAX;
46056 break;
46057 case ISD::SETULE:
46058 // Converting this to a max would handle both negative zeros and NaNs
46059 // incorrectly, but we can swap the operands to fix both.
46060 std::swap(LHS, RHS);
46061 [[fallthrough]];
46062 case ISD::SETOLT:
46063 case ISD::SETLT:
46064 case ISD::SETLE:
46065 Opcode = X86ISD::FMAX;
46066 break;
46067 }
46068 }
46069
46070 if (Opcode)
46071 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46072 }
46073
46074 // Some mask scalar intrinsics rely on checking if only one bit is set
46075 // and implement it in C code like this:
46076 // A[0] = (U & 1) ? A[0] : W[0];
46077 // This creates some redundant instructions that break pattern matching.
46078 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46079 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46080 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46081 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46082 SDValue AndNode = Cond.getOperand(0);
46083 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46084 isNullConstant(Cond.getOperand(1)) &&
46085 isOneConstant(AndNode.getOperand(1))) {
46086 // LHS and RHS swapped due to
46087 // setcc outputting 1 when AND resulted in 0 and vice versa.
46088 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46089 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46090 }
46091 }
46092
46093 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46094 // lowering on KNL. In this case we convert it to
46095 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46096 // The same situation applies to all vectors of i8 and i16 without BWI.
46097 // Make sure we extend these even before type legalization gets a chance to
46098 // split wide vectors.
46099 // Since SKX these selects have a proper lowering.
46100 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46101 CondVT.getVectorElementType() == MVT::i1 &&
46102 (VT.getVectorElementType() == MVT::i8 ||
46103 VT.getVectorElementType() == MVT::i16)) {
46104 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46105 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46106 }
46107
46108 // AVX512 - Extend select with zero to merge with target shuffle.
46109 // select(mask, extract_subvector(shuffle(x)), zero) -->
46110 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46111 // TODO - support non target shuffles as well.
46112 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46113 CondVT.getVectorElementType() == MVT::i1) {
46114 auto SelectableOp = [&TLI](SDValue Op) {
46115 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46116 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46117 isNullConstant(Op.getOperand(1)) &&
46118 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46119 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46120 };
46121
46122 bool SelectableLHS = SelectableOp(LHS);
46123 bool SelectableRHS = SelectableOp(RHS);
46124 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46125 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46126
46127 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46128 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46129 : RHS.getOperand(0).getValueType();
46130 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46131 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46132 VT.getSizeInBits());
46133 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46134 VT.getSizeInBits());
46135 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46136 DAG.getUNDEF(SrcCondVT), Cond,
46137 DAG.getIntPtrConstant(0, DL));
46138 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46139 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46140 }
46141 }
46142
46143 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
46144 return V;
46145
46146 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46147 Cond.hasOneUse()) {
46148 EVT CondVT = Cond.getValueType();
46149 SDValue Cond0 = Cond.getOperand(0);
46150 SDValue Cond1 = Cond.getOperand(1);
46151 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46152
46153 // Canonicalize min/max:
46154 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46155 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46156 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46157 // the need for an extra compare against zero. e.g.
46158 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46159 // subl %esi, %edi
46160 // testl %edi, %edi
46161 // movl $0, %eax
46162 // cmovgl %edi, %eax
46163 // =>
46164 // xorl %eax, %eax
46165 // subl %esi, %edi
46166 // cmovsl %eax, %edi
46167 //
46168 // We can also canonicalize
46169 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46170 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46171 // This allows the use of a test instruction for the compare.
46172 if (LHS == Cond0 && RHS == Cond1) {
46173 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46174 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46175 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46176 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46177 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46178 }
46179 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46180 ISD::CondCode NewCC = ISD::SETUGE;
46181 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46182 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46183 }
46184 }
46185
46186 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46187 // fold eq + gt/lt nested selects into ge/le selects
46188 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46189 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46190 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46191 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46192 // .. etc ..
46193 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46194 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46195 SDValue InnerSetCC = RHS.getOperand(0);
46196 ISD::CondCode InnerCC =
46197 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46198 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46199 Cond0 == InnerSetCC.getOperand(0) &&
46200 Cond1 == InnerSetCC.getOperand(1)) {
46201 ISD::CondCode NewCC;
46202 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46203 // clang-format off
46204 case ISD::SETGT: NewCC = ISD::SETGE; break;
46205 case ISD::SETLT: NewCC = ISD::SETLE; break;
46206 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46207 case ISD::SETULT: NewCC = ISD::SETULE; break;
46208 default: NewCC = ISD::SETCC_INVALID; break;
46209 // clang-format on
46210 }
46211 if (NewCC != ISD::SETCC_INVALID) {
46212 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46213 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46214 }
46215 }
46216 }
46217 }
46218
46219 // Check if the first operand is all zeros and Cond type is vXi1.
46220 // If this is an AVX512 target, we can improve the use of zero masking by
46221 // swapping the operands and inverting the condition.
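// e.g. (vselect M, zero, X) becomes (vselect ~M, X, zero), which maps directly
// onto the AVX512 zero-masking form of the instruction producing X.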
46222 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46223 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46224 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46225 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46226 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46227 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46228 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46229 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46230 }
46231
46232 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46233 // get split by legalization.
46234 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46235 CondVT.getVectorElementType() == MVT::i1 &&
46236 TLI.isTypeLegal(VT.getScalarType())) {
46237 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46238 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46239 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46240 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46241 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46242 }
46243 }
46244
46245 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
46246 // with out-of-bounds clamping.
46247
46248 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
46249 // shift amounts exceeding the element bitwidth: for both left and right
46250 // shifts, an element whose unsigned shift amount is greater than or equal to
46251 // the element bitwidth is simply written as zero, which is exactly what the
46252 // select-with-zero patterns below require.
46253 if (N->getOpcode() == ISD::VSELECT) {
46254 using namespace llvm::SDPatternMatch;
46255 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
46256 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
46257 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
46258 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
46259 ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
46260 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
46261 m_SpecificInt(VT.getScalarSizeInBits()),
46262 m_SpecificCondCode(ISD::SETULT)))) {
46263 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
46264 : X86ISD::VSHLV,
46265 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
46266 }
46267 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
46268 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
46269 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
46270 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
46271 ISD::isConstantSplatVectorAllZeros(LHS.getNode()) &&
46272 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
46273 m_SpecificInt(VT.getScalarSizeInBits()),
46274 m_SpecificCondCode(ISD::SETUGE)))) {
46275 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
46276 : X86ISD::VSHLV,
46277 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
46278 }
46279 }
46280
46281 // Early exit check
46282 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
46283 return SDValue();
46284
46285 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget))
46286 return V;
46287
46288 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
46289 return V;
46290
46291 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
46292 return V;
46293
46294 // select(~Cond, X, Y) -> select(Cond, Y, X)
46295 if (CondVT.getScalarType() != MVT::i1) {
46296 if (SDValue CondNot = IsNOT(Cond, DAG))
46297 return DAG.getNode(N->getOpcode(), DL, VT,
46298 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46299
46300 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46301 // signbit.
46302 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46303 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46304 Cond.hasOneUse()) {
46305 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46306 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46307 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46308 }
46309 }
46310
46311 // Try to optimize vXi1 selects if both operands are either all constants or
46312 // bitcasts from scalar integer type. In that case we can convert the operands
46313 // to integer and use an integer select which will be converted to a CMOV.
46314 // We need to take a little bit of care to avoid creating an i64 type after
46315 // type legalization.
46316 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46317 VT.getVectorElementType() == MVT::i1 &&
46318 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46319 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46320 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46321 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46322 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46323
46324 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46325 LHS.getOperand(0).getValueType() == IntVT)) &&
46326 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46327 RHS.getOperand(0).getValueType() == IntVT))) {
46328 if (LHSIsConst)
46329 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46330 else
46331 LHS = LHS.getOperand(0);
46332
46333 if (RHSIsConst)
46334 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46335 else
46336 RHS = RHS.getOperand(0);
46337
46338 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46339 return DAG.getBitcast(VT, Select);
46340 }
46341 }
46342 }
46343
46344 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46345 // single bits, then invert the predicate and swap the select operands.
46346 // This can lower using a vector shift bit-hack rather than mask and compare.
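// For a splat power-of-2 mask the predicate is simply inverted; for a
// non-splat power-of-2 mask, each element's bit is shifted up to the sign bit
// so the blend can test the sign bit instead of doing a mask-and-compare.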
46347 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46348 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46349 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46350 Cond.getOperand(0).getOpcode() == ISD::AND &&
46351 isNullOrNullSplat(Cond.getOperand(1)) &&
46352 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46353 Cond.getOperand(0).getValueType() == VT) {
46354 // The 'and' mask must be composed of power-of-2 constants.
46355 SDValue And = Cond.getOperand(0);
46356 auto *C = isConstOrConstSplat(And.getOperand(1));
46357 if (C && C->getAPIntValue().isPowerOf2()) {
46358 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46359 SDValue NotCond =
46360 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46361 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46362 }
46363
46364 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46365 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46366 // 16-bit lacks a proper blendv.
46367 unsigned EltBitWidth = VT.getScalarSizeInBits();
46368 bool CanShiftBlend =
46369 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46370 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46371 (Subtarget.hasXOP()));
46372 if (CanShiftBlend &&
46373 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46374 return C->getAPIntValue().isPowerOf2();
46375 })) {
46376 // Create a left-shift constant to get the mask bits over to the sign-bit.
46377 SDValue Mask = And.getOperand(1);
46378 SmallVector<int, 32> ShlVals;
46379 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46380 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46381 ShlVals.push_back(EltBitWidth - 1 -
46382 MaskVal->getAPIntValue().exactLogBase2());
46383 }
46384 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46385 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46386 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46387 SDValue NewCond =
46388 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46389 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46390 }
46391 }
46392
46393 return SDValue();
46394}
46395
46396/// Combine:
46397/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46398/// to:
46399/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46400/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46401/// Note that this is only legal for some op/cc combinations.
46402 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46403 SelectionDAG &DAG,
46404 const X86Subtarget &Subtarget) {
46405 // This combine only operates on CMP-like nodes.
46406 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46407 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46408 return SDValue();
46409
46410 // Can't replace the cmp if it has more uses than the one we're looking at.
46411 // FIXME: We would like to be able to handle this, but would need to make sure
46412 // all uses were updated.
46413 if (!Cmp.hasOneUse())
46414 return SDValue();
46415
46416 // This only applies to variations of the common case:
46417 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46418 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46419 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46420 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46421 // Using the proper condcodes (see below), overflow is checked for.
46422
46423 // FIXME: We can generalize both constraints:
46424 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46425 // - LHS != 1
46426 // if the result is compared.
46427
46428 SDValue CmpLHS = Cmp.getOperand(0);
46429 SDValue CmpRHS = Cmp.getOperand(1);
46430 EVT CmpVT = CmpLHS.getValueType();
46431
46432 if (!CmpLHS.hasOneUse())
46433 return SDValue();
46434
46435 unsigned Opc = CmpLHS.getOpcode();
46436 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46437 return SDValue();
46438
46439 SDValue OpRHS = CmpLHS.getOperand(2);
46440 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46441 if (!OpRHSC)
46442 return SDValue();
46443
46444 APInt Addend = OpRHSC->getAPIntValue();
46445 if (Opc == ISD::ATOMIC_LOAD_SUB)
46446 Addend = -Addend;
46447
46448 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46449 if (!CmpRHSC)
46450 return SDValue();
46451
46452 APInt Comparison = CmpRHSC->getAPIntValue();
46453 APInt NegAddend = -Addend;
46454
46455 // See if we can adjust the CC to make the comparison match the negated
46456 // addend.
46457 if (Comparison != NegAddend) {
46458 APInt IncComparison = Comparison + 1;
46459 if (IncComparison == NegAddend) {
46460 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46461 Comparison = IncComparison;
46462 CC = X86::COND_AE;
46463 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46464 Comparison = IncComparison;
46465 CC = X86::COND_L;
46466 }
46467 }
46468 APInt DecComparison = Comparison - 1;
46469 if (DecComparison == NegAddend) {
46470 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46471 Comparison = DecComparison;
46472 CC = X86::COND_A;
46473 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46474 Comparison = DecComparison;
46475 CC = X86::COND_LE;
46476 }
46477 }
46478 }
46479
46480 // If the addend is the negation of the comparison value, then we can do
46481 // a full comparison by emitting the atomic arithmetic as a locked sub.
46482 if (Comparison == NegAddend) {
46483 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46484 // atomic sub.
46485 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46486 auto AtomicSub = DAG.getAtomic(
46487 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46488 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46489 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46490 AN->getMemOperand());
46491 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46492 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46493 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46494 return LockOp;
46495 }
46496
46497 // We can handle comparisons with zero in a number of cases by manipulating
46498 // the CC used.
46499 if (!Comparison.isZero())
46500 return SDValue();
46501
46502 if (CC == X86::COND_S && Addend == 1)
46503 CC = X86::COND_LE;
46504 else if (CC == X86::COND_NS && Addend == 1)
46505 CC = X86::COND_G;
46506 else if (CC == X86::COND_G && Addend == -1)
46507 CC = X86::COND_GE;
46508 else if (CC == X86::COND_LE && Addend == -1)
46509 CC = X86::COND_L;
46510 else
46511 return SDValue();
46512
46513 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46514 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46515 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46516 return LockOp;
46517}
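// For illustration only (typical shape; exact codegen is subtarget and
// context dependent): for source like
//   if (__atomic_fetch_add(&x, 1, __ATOMIC_SEQ_CST) < 0) ...
// the naive lowering needs LOCK XADD plus a separate test of the returned
// value. When the loaded value has no other use, the combine above rewrites
//   (brcond (cmp (atomic_load_add x, 1), 0), COND_S)
// into (brcond (LADD x, 1), COND_LE), so a single locked ADD both performs
// the update and produces the EFLAGS the branch consumes.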
46518
46519// Check whether we're just testing the signbit, and whether we can simplify
46520// this by tracking where the signbit came from.
46521static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
46522 SelectionDAG &DAG) {
46523 if (CC != X86::COND_S && CC != X86::COND_NS)
46524 return SDValue();
46525
46526 if (!Cmp.hasOneUse())
46527 return SDValue();
46528
46529 SDValue Src;
46530 if (Cmp.getOpcode() == X86ISD::CMP) {
46531 // CMP(X,0) -> signbit test
46532 if (!isNullConstant(Cmp.getOperand(1)))
46533 return SDValue();
46534 Src = Cmp.getOperand(0);
46535 // Peek through a SRA node as we just need the signbit.
46536 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
46537 // TODO: Use SimplifyDemandedBits instead of just SRA?
46538 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
46539 return SDValue();
46540 Src = Src.getOperand(0);
46541 } else if (Cmp.getOpcode() == X86ISD::OR) {
46542 // OR(X,Y) -> see if only one operand contributes to the signbit.
46543 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
46544 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
46545 Src = Cmp.getOperand(1);
46546 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
46547 Src = Cmp.getOperand(0);
46548 else
46549 return SDValue();
46550 } else {
46551 return SDValue();
46552 }
46553
46554 // Replace with a TEST on the MSB.
46555 SDLoc DL(Cmp);
46556 MVT SrcVT = Src.getSimpleValueType();
46557 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
46558
46559 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
46560 // peek through and adjust the TEST bit.
46561 if (Src.getOpcode() == ISD::SHL) {
46562 if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
46563 Src = Src.getOperand(0);
46564 BitMask.lshrInPlace(*ShiftAmt);
46565 }
46566 }
46567
46568 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
46569 DAG.getConstant(BitMask, DL, SrcVT));
46570 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
46571 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
46572 DAG.getConstant(0, DL, SrcVT));
46573}
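// For illustration only: a compare such as (X86cmp (sra X, 31), 0) under
// COND_S only cares about the sign bit of X, so it can be re-expressed as
// (X86cmp (and X, SignMask), 0) under COND_NE, i.e. a TEST of the MSB. If
// the source was a SHL (e.g. an expanded SIGN_EXTEND_INREG), the tested bit
// is shifted down to the bit that actually feeds the sign.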
46574
46575// Check whether a boolean test is testing a boolean value generated by
46576// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46577// code.
46578//
46579// Simplify the following patterns:
46580// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46581// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46582// to (Op EFLAGS Cond)
46583//
46584// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46585// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46586// to (Op EFLAGS !Cond)
46587//
46588// where Op could be BRCOND or CMOV.
46589//
46590static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46591 // This combine only operates on CMP-like nodes.
46592 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46593 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46594 return SDValue();
46595
46596 // Quit if not used as a boolean value.
46597 if (CC != X86::COND_E && CC != X86::COND_NE)
46598 return SDValue();
46599
46600 // Check CMP operands. One of them should be 0 or 1 and the other should be
46601 // a SetCC or a value extended from it.
46602 SDValue Op1 = Cmp.getOperand(0);
46603 SDValue Op2 = Cmp.getOperand(1);
46604
46605 SDValue SetCC;
46606 const ConstantSDNode* C = nullptr;
46607 bool needOppositeCond = (CC == X86::COND_E);
46608 bool checkAgainstTrue = false; // Is it a comparison against 1?
46609
46610 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46611 SetCC = Op2;
46612 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46613 SetCC = Op1;
46614 else // Quit if neither operand is a constant.
46615 return SDValue();
46616
46617 if (C->getZExtValue() == 1) {
46618 needOppositeCond = !needOppositeCond;
46619 checkAgainstTrue = true;
46620 } else if (C->getZExtValue() != 0)
46621 // Quit if the constant is neither 0 nor 1.
46622 return SDValue();
46623
46624 bool truncatedToBoolWithAnd = false;
46625 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46626 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46627 SetCC.getOpcode() == ISD::TRUNCATE ||
46628 SetCC.getOpcode() == ISD::AND) {
46629 if (SetCC.getOpcode() == ISD::AND) {
46630 int OpIdx = -1;
46631 if (isOneConstant(SetCC.getOperand(0)))
46632 OpIdx = 1;
46633 if (isOneConstant(SetCC.getOperand(1)))
46634 OpIdx = 0;
46635 if (OpIdx < 0)
46636 break;
46637 SetCC = SetCC.getOperand(OpIdx);
46638 truncatedToBoolWithAnd = true;
46639 } else
46640 SetCC = SetCC.getOperand(0);
46641 }
46642
46643 switch (SetCC.getOpcode()) {
46644 case X86ISD::SETCC_CARRY:
46645 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46646 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46647 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46648 // truncated to i1 using 'and'.
46649 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46650 break;
46651 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46652 "Invalid use of SETCC_CARRY!");
46653 [[fallthrough]];
46654 case X86ISD::SETCC:
46655 // Set the condition code or opposite one if necessary.
46656 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46657 if (needOppositeCond)
46658 CC = X86::GetOppositeBranchCondition(CC);
46659 return SetCC.getOperand(1);
46660 case X86ISD::CMOV: {
46661 // Check whether false/true value has canonical one, i.e. 0 or 1.
46662 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46663 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46664 // Quit if true value is not a constant.
46665 if (!TVal)
46666 return SDValue();
46667 // Quit if false value is not a constant.
46668 if (!FVal) {
46669 SDValue Op = SetCC.getOperand(0);
46670 // Skip 'zext' or 'trunc' node.
46671 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46672 Op.getOpcode() == ISD::TRUNCATE)
46673 Op = Op.getOperand(0);
46674 // A special case for rdrand/rdseed, where 0 is set if false cond is
46675 // found.
46676 if ((Op.getOpcode() != X86ISD::RDRAND &&
46677 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46678 return SDValue();
46679 }
46680 // Quit if false value is not the constant 0 or 1.
46681 bool FValIsFalse = true;
46682 if (FVal && FVal->getZExtValue() != 0) {
46683 if (FVal->getZExtValue() != 1)
46684 return SDValue();
46685 // If FVal is 1, opposite cond is needed.
46686 needOppositeCond = !needOppositeCond;
46687 FValIsFalse = false;
46688 }
46689 // Quit if TVal is not the constant opposite of FVal.
46690 if (FValIsFalse && TVal->getZExtValue() != 1)
46691 return SDValue();
46692 if (!FValIsFalse && TVal->getZExtValue() != 0)
46693 return SDValue();
46694 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46695 if (needOppositeCond)
46696 CC = X86::GetOppositeBranchCondition(CC);
46697 return SetCC.getOperand(3);
46698 }
46699 }
46700
46701 return SDValue();
46702}
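// For illustration only: with %b = (X86setcc COND_E, %eflags), a node such as
//   (X86cmov F, T, COND_NE, (X86cmp %b, 0))
// is just re-testing a boolean, so it can consume the original flags directly
// as (X86cmov F, T, COND_E, %eflags); comparing against 1, or testing with
// COND_E, selects the opposite condition instead, as described in the header
// comment above.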
46703
46704/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46705/// Match:
46706/// (X86or (X86setcc) (X86setcc))
46707/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46708static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46709 X86::CondCode &CC1, SDValue &Flags,
46710 bool &isAnd) {
46711 if (Cond->getOpcode() == X86ISD::CMP) {
46712 if (!isNullConstant(Cond->getOperand(1)))
46713 return false;
46714
46715 Cond = Cond->getOperand(0);
46716 }
46717
46718 isAnd = false;
46719
46720 SDValue SetCC0, SetCC1;
46721 switch (Cond->getOpcode()) {
46722 default: return false;
46723 case ISD::AND:
46724 case X86ISD::AND:
46725 isAnd = true;
46726 [[fallthrough]];
46727 case ISD::OR:
46728 case X86ISD::OR:
46729 SetCC0 = Cond->getOperand(0);
46730 SetCC1 = Cond->getOperand(1);
46731 break;
46732 };
46733
46734 // Make sure we have SETCC nodes, using the same flags value.
46735 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46736 SetCC1.getOpcode() != X86ISD::SETCC ||
46737 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46738 return false;
46739
46740 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46741 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46742 Flags = SetCC0->getOperand(1);
46743 return true;
46744}
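// For illustration only: a condition like
//   (X86cmp (and (X86setcc COND_E, %eflags), (X86setcc COND_L, %eflags)), 0)
// is recognized here as an AND of two SETCCs on the same EFLAGS; CC0/CC1 and
// the shared flags are returned so combineCMov below can emit two chained
// CMOVs instead of materializing both booleans and AND'ing them.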
46745
46746// When legalizing carry, we create carries via add X, -1
46747// If that comes from an actual carry, via setcc, we use the
46748// carry directly.
46749static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46750 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46751 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46752 bool FoundAndLSB = false;
46753 SDValue Carry = EFLAGS.getOperand(0);
46754 while (Carry.getOpcode() == ISD::TRUNCATE ||
46755 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46756 (Carry.getOpcode() == ISD::AND &&
46757 isOneConstant(Carry.getOperand(1)))) {
46758 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46759 Carry = Carry.getOperand(0);
46760 }
46761 if (Carry.getOpcode() == X86ISD::SETCC ||
46762 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46763 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46764 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46765 SDValue CarryOp1 = Carry.getOperand(1);
46766 if (CarryCC == X86::COND_B)
46767 return CarryOp1;
46768 if (CarryCC == X86::COND_A) {
46769 // Try to convert COND_A into COND_B in an attempt to facilitate
46770 // materializing "setb reg".
46771 //
46772 // Do not flip "e > c", where "c" is a constant, because the Cmp
46773 // instruction cannot take an immediate as its first operand.
46774 //
46775 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46776 CarryOp1.getNode()->hasOneUse() &&
46777 CarryOp1.getValueType().isInteger() &&
46778 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46779 SDValue SubCommute =
46780 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46781 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46782 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46783 }
46784 }
46785 // If this is a check of the z flag of an add with 1, switch to the
46786 // C flag.
46787 if (CarryCC == X86::COND_E &&
46788 CarryOp1.getOpcode() == X86ISD::ADD &&
46789 isOneConstant(CarryOp1.getOperand(1)))
46790 return CarryOp1;
46791 } else if (FoundAndLSB) {
46792 SDLoc DL(Carry);
46793 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46794 if (Carry.getOpcode() == ISD::SRL) {
46795 BitNo = Carry.getOperand(1);
46796 Carry = Carry.getOperand(0);
46797 }
46798 return getBT(Carry, BitNo, DL, DAG);
46799 }
46800 }
46801 }
46802
46803 return SDValue();
46804}
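// For illustration only: when a carry is legalized as
//   %c = (X86setcc COND_B, %eflags)   ; carry materialized as 0/1
//   %f = (X86add %c, -1)              ; recreate the borrow
// a user that only wants the carry flag of %f can take %eflags directly, so
// the setcc/add pair can become dead and SBB/ADC-style sequences keep feeding
// from the original flags.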
46805
46806/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
46807/// to avoid the inversion.
46808static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46809 SelectionDAG &DAG,
46810 const X86Subtarget &Subtarget) {
46811 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46812 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46813 EFLAGS.getOpcode() != X86ISD::TESTP)
46814 return SDValue();
46815
46816 // PTEST/TESTP sets EFLAGS as:
46817 // TESTZ: ZF = (Op0 & Op1) == 0
46818 // TESTC: CF = (~Op0 & Op1) == 0
46819 // TESTNZC: ZF == 0 && CF == 0
46820 MVT VT = EFLAGS.getSimpleValueType();
46821 SDValue Op0 = EFLAGS.getOperand(0);
46822 SDValue Op1 = EFLAGS.getOperand(1);
46823 MVT OpVT = Op0.getSimpleValueType();
46824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46825
46826 // TEST*(~X,Y) == TEST*(X,Y)
46827 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46828 X86::CondCode InvCC;
46829 switch (CC) {
46830 case X86::COND_B:
46831 // testc -> testz.
46832 InvCC = X86::COND_E;
46833 break;
46834 case X86::COND_AE:
46835 // !testc -> !testz.
46836 InvCC = X86::COND_NE;
46837 break;
46838 case X86::COND_E:
46839 // testz -> testc.
46840 InvCC = X86::COND_B;
46841 break;
46842 case X86::COND_NE:
46843 // !testz -> !testc.
46844 InvCC = X86::COND_AE;
46845 break;
46846 case X86::COND_A:
46847 case X86::COND_BE:
46848 // testnzc -> testnzc (no change).
46849 InvCC = CC;
46850 break;
46851 default:
46852 InvCC = X86::COND_INVALID;
46853 break;
46854 }
46855
46856 if (InvCC != X86::COND_INVALID) {
46857 CC = InvCC;
46858 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46859 DAG.getBitcast(OpVT, NotOp0), Op1);
46860 }
46861 }
46862
46863 if (CC == X86::COND_B || CC == X86::COND_AE) {
46864 // TESTC(X,~X) == TESTC(X,-1)
46865 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46866 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46867 SDLoc DL(EFLAGS);
46868 return DAG.getNode(
46869 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46870 DAG.getBitcast(OpVT,
46871 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46872 }
46873 }
46874 }
46875
46876 if (CC == X86::COND_E || CC == X86::COND_NE) {
46877 // TESTZ(X,~Y) == TESTC(Y,X)
46878 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46879 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46880 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46881 DAG.getBitcast(OpVT, NotOp1), Op0);
46882 }
46883
46884 if (Op0 == Op1) {
46885 SDValue BC = peekThroughBitcasts(Op0);
46886 EVT BCVT = BC.getValueType();
46887
46888 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46889 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46890 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46891 DAG.getBitcast(OpVT, BC.getOperand(0)),
46892 DAG.getBitcast(OpVT, BC.getOperand(1)));
46893 }
46894
46895 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46896 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46897 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46898 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46899 DAG.getBitcast(OpVT, BC.getOperand(0)),
46900 DAG.getBitcast(OpVT, BC.getOperand(1)));
46901 }
46902
46903 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46904 // to more efficiently extract the sign bits and compare that.
46905 // TODO: Handle TESTC with comparison inversion.
46906 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46907 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
46908 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46909 unsigned EltBits = BCVT.getScalarSizeInBits();
46910 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46911 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46912 APInt SignMask = APInt::getSignMask(EltBits);
46913 if (SDValue Res =
46914 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46915 // For vXi16 cases we need to use pmovmskb and extract every other
46916 // sign bit.
46917 SDLoc DL(EFLAGS);
46918 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46919 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46920 MVT FloatVT =
46921 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46922 Res = DAG.getBitcast(FloatVT, Res);
46923 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46924 } else if (EltBits == 16) {
46925 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46926 Res = DAG.getBitcast(MovmskVT, Res);
46927 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46928 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46929 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46930 } else {
46931 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46932 }
46933 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46934 DAG.getConstant(0, DL, MVT::i32));
46935 }
46936 }
46937 }
46938 }
46939
46940 // TESTZ(-1,X) == TESTZ(X,X)
46941 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46942 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46943
46944 // TESTZ(X,-1) == TESTZ(X,X)
46945 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46946 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46947
46948 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46949 // TODO: Add COND_NE handling?
46950 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46951 SDValue Src0 = peekThroughBitcasts(Op0);
46952 SDValue Src1 = peekThroughBitcasts(Op1);
46953 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46954 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46955 peekThroughBitcasts(Src0.getOperand(1)), true);
46956 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46957 peekThroughBitcasts(Src1.getOperand(1)), true);
46958 if (Src0 && Src1) {
46959 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46960 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46961 DAG.getBitcast(OpVT2, Src0),
46962 DAG.getBitcast(OpVT2, Src1));
46963 }
46964 }
46965 }
46966 }
46967
46968 return SDValue();
46969}
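// For illustration of the NOT fold above: PTEST/TESTP compute
//   ZF = ((Op0 & Op1) == 0)  and  CF = ((~Op0 & Op1) == 0),
// so when Op0 is known to be ~N, a CF-based test (COND_B/COND_AE) of
// PTEST(~N, Y) is equivalent to a ZF-based test (COND_E/COND_NE) of
// PTEST(N, Y); the NOT disappears and only the condition code changes. The
// TESTNZC conditions (COND_A/COND_BE) are symmetric and stay unchanged.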
46970
46971// Attempt to simplify the MOVMSK input based on the comparison type.
46972static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46973 SelectionDAG &DAG,
46974 const X86Subtarget &Subtarget) {
46975 // Handle eq/ne against zero (any_of).
46976 // Handle eq/ne against -1 (all_of).
46977 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46978 return SDValue();
46979 if (EFLAGS.getValueType() != MVT::i32)
46980 return SDValue();
46981 unsigned CmpOpcode = EFLAGS.getOpcode();
46982 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46983 return SDValue();
46984 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46985 if (!CmpConstant)
46986 return SDValue();
46987 const APInt &CmpVal = CmpConstant->getAPIntValue();
46988
46989 SDValue CmpOp = EFLAGS.getOperand(0);
46990 unsigned CmpBits = CmpOp.getValueSizeInBits();
46991 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46992
46993 // Peek through any truncate.
46994 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46995 CmpOp = CmpOp.getOperand(0);
46996
46997 // Bail if we don't find a MOVMSK.
46998 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46999 return SDValue();
47000
47001 SDValue Vec = CmpOp.getOperand(0);
47002 MVT VecVT = Vec.getSimpleValueType();
47003 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47004 "Unexpected MOVMSK operand");
47005 unsigned NumElts = VecVT.getVectorNumElements();
47006 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47007
47008 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47009 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47010 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47011 if (!IsAnyOf && !IsAllOf)
47012 return SDValue();
47013
47014 // TODO: Check whether more combining cases should be handled here.
47015 // We use the number of uses of the CMP operand to decide whether to combine.
47016 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
47017 // below are limited by this one-use constraint.
47018 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47019
47020 // See if we can peek through to a vector with a wider element type, if the
47021 // signbits extend down to all the sub-elements as well.
47022 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47023 // potential SimplifyDemandedBits/Elts cases.
47024 // If we looked through a truncate that discarded bits, we can't do this
47025 // transform.
47026 // FIXME: We could do this transform for truncates that discarded bits by
47027 // inserting an AND mask between the new MOVMSK and the CMP.
47028 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47029 SDValue BC = peekThroughBitcasts(Vec);
47030 MVT BCVT = BC.getSimpleValueType();
47031 unsigned BCNumElts = BCVT.getVectorNumElements();
47032 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47033 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47034 BCNumEltBits > NumEltBits &&
47035 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47036 SDLoc DL(EFLAGS);
47037 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47038 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47039 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47040 DAG.getConstant(CmpMask, DL, MVT::i32));
47041 }
47042 }
47043
47044 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47045 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47046 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47047 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47048 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47049 SmallVector<SDValue> Ops;
47050 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47051 Ops.size() == 2) {
47052 SDLoc DL(EFLAGS);
47053 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47054 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47055 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47056 DAG.getBitcast(SubVT, Ops[0]),
47057 DAG.getBitcast(SubVT, Ops[1]));
47058 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47059 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47060 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47061 DAG.getConstant(CmpMask, DL, MVT::i32));
47062 }
47063 }
47064
47065 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47066 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47067 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47068 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47069 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47070 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47071 SDValue BC = peekThroughBitcasts(Vec);
47072 // Ensure MOVMSK was testing every signbit of BC.
47073 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47074 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47075 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47076 BC.getOperand(0), BC.getOperand(1));
47077 V = DAG.getBitcast(TestVT, V);
47078 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47079 }
47080 // Check for 256-bit split vector cases.
47081 if (BC.getOpcode() == ISD::AND &&
47082 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47083 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47084 SDValue LHS = BC.getOperand(0);
47085 SDValue RHS = BC.getOperand(1);
47086 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47087 LHS.getOperand(0), LHS.getOperand(1));
47088 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47089 RHS.getOperand(0), RHS.getOperand(1));
47090 LHS = DAG.getBitcast(TestVT, LHS);
47091 RHS = DAG.getBitcast(TestVT, RHS);
47092 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47093 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47094 }
47095 }
47096 }
47097
47098 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47099 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47100 // sign bits prior to the comparison with zero unless we know that
47101 // the vXi16 splats the sign bit down to the lower i8 half.
47102 // TODO: Handle all_of patterns.
47103 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47104 SDValue VecOp0 = Vec.getOperand(0);
47105 SDValue VecOp1 = Vec.getOperand(1);
47106 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47107 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47108 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47109 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47110 SDLoc DL(EFLAGS);
47111 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47112 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47113 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47114 if (!SignExt0) {
47115 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47116 DAG.getConstant(0xAAAA, DL, MVT::i16));
47117 }
47118 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47119 DAG.getConstant(0, DL, MVT::i16));
47120 }
47121 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47122 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47123 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47124 (IsAnyOf || (SignExt0 && SignExt1))) {
47125 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47126 SDLoc DL(EFLAGS);
47127 SDValue Result = peekThroughBitcasts(Src);
47128 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47129 Result.getValueType().getVectorNumElements() <= NumElts) {
47130 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47131 Result.getOperand(0), Result.getOperand(1));
47132 V = DAG.getBitcast(MVT::v4i64, V);
47133 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47134 }
47135 Result = DAG.getBitcast(MVT::v32i8, Result);
47136 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47137 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47138 if (!SignExt0 || !SignExt1) {
47139 assert(IsAnyOf &&
47140 "Only perform v16i16 signmasks for any_of patterns");
47141 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47142 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47143 }
47144 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47145 DAG.getConstant(CmpMask, DL, MVT::i32));
47146 }
47147 }
47148 }
47149
47150 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47151 // Since we peek through a bitcast, we need to be careful if the base vector
47152 // type has smaller elements than the MOVMSK type. In that case, even if
47153 // all the elements are demanded by the shuffle mask, only the "high"
47154 // elements which have highbits that align with highbits in the MOVMSK vec
47155 // elements are actually demanded. A simplification of spurious operations
47156 // on the "low" elements takes place during other simplifications.
47157 //
47158 // For example:
47159 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
47160 // demanded, the result can change because we are swapping the elements around.
47161 //
47162 // To address this, we check that we can scale the shuffle mask to MOVMSK
47163 // element width (this will ensure "high" elements match). It's slightly overly
47164 // conservative, but fine for an edge case fold.
47165 SmallVector<int, 32> ShuffleMask;
47166 SmallVector<SDValue, 2> ShuffleInputs;
47167 if (NumElts <= CmpBits &&
47168 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47169 ShuffleMask, DAG) &&
47170 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
47171 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
47172 canScaleShuffleElements(ShuffleMask, NumElts)) {
47173 SDLoc DL(EFLAGS);
47174 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47175 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47176 Result =
47177 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47178 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
47179 }
47180
47181 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47182 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47183 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47184 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47185 // iff every element is referenced.
47186 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
47187 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
47188 (NumEltBits == 32 || NumEltBits == 64)) {
47189 SDLoc DL(EFLAGS);
47190 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47191 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47192 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
47193 SDValue LHS = Vec;
47194 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
47195 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47196 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
47197 DAG.getBitcast(FloatVT, LHS),
47198 DAG.getBitcast(FloatVT, RHS));
47199 }
47200
47201 return SDValue();
47202}
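// For illustration only: MOVMSK(V) ==/!= 0 answers "is any sign bit set"
// (any_of), while MOVMSK(V) ==/!= (2^NumElts - 1) answers "is every sign bit
// set" (all_of). For example, for a v4i32 compare result C, the all_of check
// "MOVMSK(C) == 0xF" can, with AVX, be answered by a single TESTPS of C
// against all-ones using the carry flag, avoiding the scalar mask
// materialization and compare.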
47203
47204/// Optimize an EFLAGS definition used according to the condition code \p CC
47205/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47206/// uses of chain values.
47207static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47208 SelectionDAG &DAG,
47209 const X86Subtarget &Subtarget) {
47210 if (CC == X86::COND_B)
47211 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47212 return Flags;
47213
47214 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
47215 return R;
47216
47217 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47218 return R;
47219
47220 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47221 return R;
47222
47223 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47224 return R;
47225
47226 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47227}
47228
47229/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47230static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47231 TargetLowering::DAGCombinerInfo &DCI,
47232 const X86Subtarget &Subtarget) {
47233 SDLoc DL(N);
47234
47235 SDValue FalseOp = N->getOperand(0);
47236 SDValue TrueOp = N->getOperand(1);
47237 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47238 SDValue Cond = N->getOperand(3);
47239
47240 // cmov X, X, ?, ? --> X
47241 if (TrueOp == FalseOp)
47242 return TrueOp;
47243
47244 // Try to simplify the EFLAGS and condition code operands.
47245 // We can't always do this as FCMOV only supports a subset of X86 cond.
47246 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47247 if (!(FalseOp.getValueType() == MVT::f80 ||
47248 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47249 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47250 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47251 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47252 Flags};
47253 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47254 }
47255 }
47256
47257 // If this is a select between two integer constants, try to do some
47258 // optimizations. Note that the operands are ordered the opposite of SELECT
47259 // operands.
47260 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47261 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47262 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47263 // larger than FalseC (the false value).
47264 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47265 CC = X86::GetOppositeBranchCondition(CC);
47266 std::swap(TrueC, FalseC);
47267 std::swap(TrueOp, FalseOp);
47268 }
47269
47270 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47271 // This is efficient for any integer data type (including i8/i16) and
47272 // shift amount.
47273 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47274 Cond = getSETCC(CC, Cond, DL, DAG);
47275
47276 // Zero extend the condition if needed.
47277 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47278
47279 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47280 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47281 DAG.getConstant(ShAmt, DL, MVT::i8));
47282 return Cond;
47283 }
47284
47285 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
47286 // for any integer data type, including i8/i16.
47287 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47288 Cond = getSETCC(CC, Cond, DL, DAG);
47289
47290 // Zero extend the condition if needed.
47291 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47292 FalseC->getValueType(0), Cond);
47293 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47294 SDValue(FalseC, 0));
47295 return Cond;
47296 }
47297
47298 // Optimize cases that will turn into an LEA instruction. This requires
47299 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47300 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47301 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47302 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47303 "Implicit constant truncation");
47304
47305 bool isFastMultiplier = false;
47306 if (Diff.ult(10)) {
47307 switch (Diff.getZExtValue()) {
47308 default: break;
47309 case 1: // result = add base, cond
47310 case 2: // result = lea base( , cond*2)
47311 case 3: // result = lea base(cond, cond*2)
47312 case 4: // result = lea base( , cond*4)
47313 case 5: // result = lea base(cond, cond*4)
47314 case 8: // result = lea base( , cond*8)
47315 case 9: // result = lea base(cond, cond*8)
47316 isFastMultiplier = true;
47317 break;
47318 }
47319 }
47320
47321 if (isFastMultiplier) {
47322 Cond = getSETCC(CC, Cond, DL ,DAG);
47323 // Zero extend the condition if needed.
47324 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47325 Cond);
47326 // Scale the condition by the difference.
47327 if (Diff != 1)
47328 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47329 DAG.getConstant(Diff, DL, Cond.getValueType()));
47330
47331 // Add the base if non-zero.
47332 if (FalseC->getAPIntValue() != 0)
47333 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47334 SDValue(FalseC, 0));
47335 return Cond;
47336 }
47337 }
47338 }
47339 }
47340
47341 // Handle these cases:
47342 // (select (x != c), e, c) -> select (x != c), e, x),
47343 // (select (x == c), c, e) -> select (x == c), x, e)
47344 // where the c is an integer constant, and the "select" is the combination
47345 // of CMOV and CMP.
47346 //
47347 // The rationale for this change is that a conditional move from a constant
47348 // needs two instructions, whereas a conditional move from a register needs
47349 // only one instruction.
47350 //
47351 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47352 // some instruction-combining opportunities. This opt needs to be
47353 // postponed as late as possible.
47354 //
47355 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47356 // the DCI.xxxx conditions are provided to postpone the optimization as
47357 // late as possible.
47358
47359 ConstantSDNode *CmpAgainst = nullptr;
47360 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47361 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47362 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47363
47364 if (CC == X86::COND_NE &&
47365 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47366 CC = X86::GetOppositeBranchCondition(CC);
47367 std::swap(TrueOp, FalseOp);
47368 }
47369
47370 if (CC == X86::COND_E &&
47371 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47372 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47373 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47374 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47375 }
47376 }
47377 }
47378
47379 // Transform:
47380 //
47381 // (cmov 1 T (uge T 2))
47382 //
47383 // to:
47384 //
47385 // (adc T 0 (sub T 1))
47386 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47387 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47388 SDValue Cond0 = Cond.getOperand(0);
47389 if (Cond0.getOpcode() == ISD::TRUNCATE)
47390 Cond0 = Cond0.getOperand(0);
47391 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47392 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47393 EVT CondVT = Cond->getValueType(0);
47394 EVT OuterVT = N->getValueType(0);
47395 // Subtract 1 and generate a carry.
47396 SDValue NewSub =
47397 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47398 DAG.getConstant(1, DL, CondVT));
47399 SDValue EFLAGS(NewSub.getNode(), 1);
47400 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47401 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47402 }
47403 }
47404
47405 // Fold and/or of setcc's to double CMOV:
47406 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47407 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47408 //
47409 // This combine lets us generate:
47410 // cmovcc1 (jcc1 if we don't have CMOV)
47411 // cmovcc2 (same)
47412 // instead of:
47413 // setcc1
47414 // setcc2
47415 // and/or
47416 // cmovne (jne if we don't have CMOV)
47417 // When we can't use the CMOV instruction, it might increase branch
47418 // mispredicts.
47419 // When we can use CMOV, or when there is no mispredict, this improves
47420 // throughput and reduces register pressure.
47421 //
47422 if (CC == X86::COND_NE) {
47423 SDValue Flags;
47424 X86::CondCode CC0, CC1;
47425 bool isAndSetCC;
47426 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47427 if (isAndSetCC) {
47428 std::swap(FalseOp, TrueOp);
47429 CC0 = X86::GetOppositeBranchCondition(CC0);
47430 CC1 = X86::GetOppositeBranchCondition(CC1);
47431 }
47432
47433 SDValue LOps[] = {FalseOp, TrueOp,
47434 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47435 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47436 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47437 Flags};
47438 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47439 return CMOV;
47440 }
47441 }
47442
47443 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47444 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47445 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47446 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47447 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47448 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47449 SDValue Add = TrueOp;
47450 SDValue Const = FalseOp;
47451 // Canonicalize the condition code for easier matching and output.
47452 if (CC == X86::COND_E)
47453 std::swap(Add, Const);
47454
47455 // We might have replaced the constant in the cmov with the LHS of the
47456 // compare. If so change it to the RHS of the compare.
47457 if (Const == Cond.getOperand(0))
47458 Const = Cond.getOperand(1);
47459
47460 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47461 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47462 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47463 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47464 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47465 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47466 EVT VT = N->getValueType(0);
47467 // This should constant fold.
47468 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47469 SDValue CMov =
47470 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47471 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47472 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47473 }
47474 }
47475
47476 return SDValue();
47477}
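// For illustration only (rough scalar sketches; actual selection varies):
//   cond ? 8 : 0   ->  setcc; zext; shl by 3              (pow2-vs-zero case)
//   cond ? 6 : 5   ->  setcc; zext; add 5                 (cst+1 vs cst case)
//   cond ? 10 : 2  ->  setcc; zext; lea (2 + 8*cond)      (difference 8 is a
//                                                          fast LEA multiplier)
// each of which avoids materializing both constants for a CMOV.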
47478
47479/// Different mul shrinking modes.
47480enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47481
47482static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47483 EVT VT = N->getOperand(0).getValueType();
47484 if (VT.getScalarSizeInBits() != 32)
47485 return false;
47486
47487 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47488 unsigned SignBits[2] = {1, 1};
47489 bool IsPositive[2] = {false, false};
47490 for (unsigned i = 0; i < 2; i++) {
47491 SDValue Opd = N->getOperand(i);
47492
47493 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47494 IsPositive[i] = DAG.SignBitIsZero(Opd);
47495 }
47496
47497 bool AllPositive = IsPositive[0] && IsPositive[1];
47498 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47499 // When ranges are from -128 ~ 127, use MULS8 mode.
47500 if (MinSignBits >= 25)
47501 Mode = ShrinkMode::MULS8;
47502 // When ranges are from 0 ~ 255, use MULU8 mode.
47503 else if (AllPositive && MinSignBits >= 24)
47504 Mode = ShrinkMode::MULU8;
47505 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47506 else if (MinSignBits >= 17)
47507 Mode = ShrinkMode::MULS16;
47508 // When ranges are from 0 ~ 65535, use MULU16 mode.
47509 else if (AllPositive && MinSignBits >= 16)
47510 Mode = ShrinkMode::MULU16;
47511 else
47512 return false;
47513 return true;
47514}
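// For illustration only: for a v8i32 multiply whose operands were both
// sign-extended from v8i16, ComputeNumSignBits reports at least 17 sign bits
// per element, so Mode becomes ShrinkMode::MULS16 and reduceVMULWidth below
// can use 16-bit multiplies. Values that fit in 8 bits (25+ sign bits, or
// 24+ when both operands are known non-negative) select the MULS8/MULU8
// forms, where only pmullw plus an extend is needed.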
47515
47516/// When the operands of vector mul are extended from smaller size values,
47517/// like i8 and i16, the type of the mul may be shrunk to generate more
47518/// efficient code. Two typical patterns are handled:
47519/// Pattern1:
47520/// %2 = sext/zext <N x i8> %1 to <N x i32>
47521/// %4 = sext/zext <N x i8> %3 to <N x i32>
47522/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47523/// %5 = mul <N x i32> %2, %4
47524///
47525/// Pattern2:
47526/// %2 = zext/sext <N x i16> %1 to <N x i32>
47527/// %4 = zext/sext <N x i16> %3 to <N x i32>
47528/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47529/// %5 = mul <N x i32> %2, %4
47530///
47531/// There are four mul shrinking modes:
47532/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47533/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47534/// generate pmullw+sext32 for it (MULS8 mode).
47535/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47536/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47537/// generate pmullw+zext32 for it (MULU8 mode).
47538/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47539/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47540/// generate pmullw+pmulhw for it (MULS16 mode).
47541/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47542/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47543/// generate pmullw+pmulhuw for it (MULU16 mode).
47544static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47545 const X86Subtarget &Subtarget) {
47546 // Check for legality
47547 // pmullw/pmulhw are not available before SSE2.
47548 if (!Subtarget.hasSSE2())
47549 return SDValue();
47550
47551 // Check for profitability
47552 // pmulld is supported since SSE41. It is better to use pmulld
47553 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47554 // the expansion.
47555 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47556 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47557 return SDValue();
47558
47559 ShrinkMode Mode;
47560 if (!canReduceVMulWidth(N, DAG, Mode))
47561 return SDValue();
47562
47563 SDValue N0 = N->getOperand(0);
47564 SDValue N1 = N->getOperand(1);
47565 EVT VT = N->getOperand(0).getValueType();
47566 unsigned NumElts = VT.getVectorNumElements();
47567 if ((NumElts % 2) != 0)
47568 return SDValue();
47569
47570 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47571
47572 // Shrink the operands of mul.
47573 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47574 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47575
47576 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47577 // lower part is needed.
47578 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47579 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47580 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47581 : ISD::SIGN_EXTEND,
47582 DL, VT, MulLo);
47583
47584 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47585 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47586 // the higher part is also needed.
47587 SDValue MulHi =
47588 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47589 ReducedVT, NewN0, NewN1);
47590
47591 // Repack the lower part and higher part result of mul into a wider
47592 // result.
47593 // Generate shuffle functioning as punpcklwd.
47594 SmallVector<int, 16> ShuffleMask(NumElts);
47595 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47596 ShuffleMask[2 * i] = i;
47597 ShuffleMask[2 * i + 1] = i + NumElts;
47598 }
47599 SDValue ResLo =
47600 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47601 ResLo = DAG.getBitcast(ResVT, ResLo);
47602 // Generate shuffle functioning as punpckhwd.
47603 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47604 ShuffleMask[2 * i] = i + NumElts / 2;
47605 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47606 }
47607 SDValue ResHi =
47608 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47609 ResHi = DAG.getBitcast(ResVT, ResHi);
47610 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47611}
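// For illustration of the MULS16/MULU16 path above: with the operands
// truncated to vXi16, MulLo (pmullw) holds the low 16 bits and MulHi
// (pmulhw/pmulhuw) the high 16 bits of each 16x16 product; the two shuffles
// interleave them like punpcklwd/punpckhwd, so the bitcast halves
// concatenate back into the original vXi32 products.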
47612
47613static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47614 EVT VT, const SDLoc &DL) {
47615
47616 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47617 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47618 DAG.getConstant(Mult, DL, VT));
47619 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47620 DAG.getConstant(Shift, DL, MVT::i8));
47621 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47622 N->getOperand(0));
47623 return Result;
47624 };
47625
47626 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47627 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47628 DAG.getConstant(Mul1, DL, VT));
47629 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47630 DAG.getConstant(Mul2, DL, VT));
47631 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47632 N->getOperand(0));
47633 return Result;
47634 };
47635
47636 switch (MulAmt) {
47637 default:
47638 break;
47639 case 11:
47640 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47641 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47642 case 21:
47643 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47644 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47645 case 41:
47646 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47647 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47648 case 22:
47649 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47650 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47651 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47652 case 19:
47653 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47654 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47655 case 37:
47656 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47657 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47658 case 73:
47659 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47660 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47661 case 13:
47662 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47663 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47664 case 23:
47665 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47666 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47667 case 26:
47668 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47669 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47670 case 28:
47671 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47672 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47673 case 29:
47674 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47675 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47676 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47677 }
47678
47679 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47680 // by a single LEA.
47681 // First check if this is a sum of two powers of 2 because that's easy. Then
47682 // count the trailing zeros up to the first set bit.
47683 // TODO: We can do this even without LEA at a cost of two shifts and an add.
47684 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47685 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47686 if (ScaleShift >= 1 && ScaleShift < 4) {
47687 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47688 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47689 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47690 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47691 DAG.getConstant(ScaleShift, DL, MVT::i8));
47692 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47693 }
47694 }
47695
47696 return SDValue();
47697}
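// For illustration only (rough sketches; LEA/shift selection may differ):
//   mul x, 11  ->  t = 5*x (lea) ; r = x + 2*t (lea)        ((5x << 1) + x)
//   mul x, 23  ->  t = 3*x (lea) ; r = (t << 3) - x         ((3x << 3) - x)
//   mul x, 26  ->  t = 5*x (lea) ; t = 5*t (lea) ; r = t + x  (5*5x + x)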
47698
47699// If the upper 17 bits of either operand are zero and the upper bits of the
47700// other operand are all zero/sign bits, then we can use PMADDWD, which is
47701// always at least as quick as PMULLD, except on KNL.
47702static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
47703 SelectionDAG &DAG,
47704 const X86Subtarget &Subtarget) {
47705 if (!Subtarget.hasSSE2())
47706 return SDValue();
47707
47708 if (Subtarget.isPMADDWDSlow())
47709 return SDValue();
47710
47711 EVT VT = N->getValueType(0);
47712
47713 // Only support vXi32 vectors.
47714 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47715 return SDValue();
47716
47717 // Make sure the type is legal or can split/widen to a legal type.
47718 // With AVX512 but without BWI, we would need to split v32i16.
47719 unsigned NumElts = VT.getVectorNumElements();
47720 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47721 return SDValue();
47722
47723 // With AVX512 but without BWI, we would need to split v32i16.
47724 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47725 return SDValue();
47726
47727 SDValue N0 = N->getOperand(0);
47728 SDValue N1 = N->getOperand(1);
47729
47730 // If we are zero/sign extending two steps without SSE4.1, it's better to
47731 // reduce the vmul width instead.
47732 if (!Subtarget.hasSSE41() &&
47733 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47734 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47735 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47736 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47737 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47738 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47739 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47740 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47741 return SDValue();
47742
47743 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47744 // the vmul width instead.
47745 if (!Subtarget.hasSSE41() &&
47746 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47747 N0.getOperand(0).getValueSizeInBits() > 128) &&
47748 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47749 N1.getOperand(0).getValueSizeInBits() > 128))
47750 return SDValue();
47751
47752 // Sign bits must extend down to the lowest i16.
47753 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47754 DAG.ComputeMaxSignificantBits(N0) > 16)
47755 return SDValue();
47756
47757 // At least one of the elements must be zero in the upper 17 bits, or can be
47758 // safely made zero without altering the final result.
47759 auto GetZeroableOp = [&](SDValue Op) {
47760 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47761 if (DAG.MaskedValueIsZero(Op, Mask17))
47762 return Op;
47763 // Mask off upper 16-bits of sign-extended constants.
47764 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47765 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
47766 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47767 SDValue Src = Op.getOperand(0);
47768 // Convert sext(vXi16) to zext(vXi16).
47769 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47770 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47771 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47772 // which will expand the extension.
47773 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47774 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47775 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
47776 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47777 }
47778 }
47779 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
47780 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47781 N->isOnlyUserOf(Op.getNode())) {
47782 SDValue Src = Op.getOperand(0);
47783 if (Src.getScalarValueSizeInBits() == 16)
47784 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
47785 }
47786 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47787 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47788 N->isOnlyUserOf(Op.getNode())) {
47789 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
47790 Op.getOperand(1));
47791 }
47792 return SDValue();
47793 };
47794 SDValue ZeroN0 = GetZeroableOp(N0);
47795 SDValue ZeroN1 = GetZeroableOp(N1);
47796 if (!ZeroN0 && !ZeroN1)
47797 return SDValue();
47798 N0 = ZeroN0 ? ZeroN0 : N0;
47799 N1 = ZeroN1 ? ZeroN1 : N1;
47800
47801 // Use SplitOpsAndApply to handle AVX splitting.
47802 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47803 ArrayRef<SDValue> Ops) {
47804 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47805 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47806 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47807 DAG.getBitcast(OpVT, Ops[0]),
47808 DAG.getBitcast(OpVT, Ops[1]));
47809 };
47810 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
47811}
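// For illustration only: a v4i32 multiply whose operands are zero-extended
// from v4i8 passes the checks above (upper 17 bits zero, significant bits
// fit in an i16), so each 32-bit lane can be treated as an i16 pair whose
// high half is zero; PMADDWD's "multiply pairs and add" then reduces to a
// single 16x16->32 product per lane, matching the original multiply.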
47812
47813static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47814 const X86Subtarget &Subtarget) {
47815 if (!Subtarget.hasSSE2())
47816 return SDValue();
47817
47818 EVT VT = N->getValueType(0);
47819
47820 // Only support vXi64 vectors.
47821 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47822 VT.getVectorNumElements() < 2 ||
47823 !isPowerOf2_32(VT.getVectorNumElements()))
47824 return SDValue();
47825
47826 SDValue N0 = N->getOperand(0);
47827 SDValue N1 = N->getOperand(1);
47828
47829 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47830 // 32-bits. We can lower with this if the sign bits stretch that far.
47831 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47832 DAG.ComputeNumSignBits(N1) > 32) {
47833 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47834 ArrayRef<SDValue> Ops) {
47835 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47836 };
47837 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
47838 /*CheckBWI*/ false);
47839 }
47840
47841 // If the upper bits are zero we can use a single pmuludq.
47842 APInt Mask = APInt::getHighBitsSet(64, 32);
47843 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47844 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47845 ArrayRef<SDValue> Ops) {
47846 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47847 };
47848 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
47849 /*CheckBWI*/ false);
47850 }
47851
47852 return SDValue();
47853}
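// For illustration only: a v2i64 multiply whose operands both have their
// upper 32 bits known zero can be emitted as one PMULUDQ, which already
// produces the full 64-bit product of the low 32-bit halves; the SSE4.1
// PMULDQ form covers the analogous case where the sign bits extend past
// bit 32.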
47854
47855static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47856 TargetLowering::DAGCombinerInfo &DCI,
47857 const X86Subtarget &Subtarget) {
47858 EVT VT = N->getValueType(0);
47859 SDLoc DL(N);
47860
47861 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
47862 return V;
47863
47864 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
47865 return V;
47866
47867 if (DCI.isBeforeLegalize() && VT.isVector())
47868 return reduceVMULWidth(N, DL, DAG, Subtarget);
47869
47870 // Optimize a single multiply with constant into two operations in order to
47871 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47872 if (!MulConstantOptimization)
47873 return SDValue();
47874
47875 // An imul is usually smaller than the alternative sequence.
47876 if (DAG.getMachineFunction().getFunction().hasMinSize())
47877 return SDValue();
47878
47879 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47880 return SDValue();
47881
47882 if (VT != MVT::i64 && VT != MVT::i32 &&
47883 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47884 return SDValue();
47885
47886 ConstantSDNode *CNode = isConstOrConstSplat(
47887 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47888 const APInt *C = nullptr;
47889 if (!CNode) {
47890 if (VT.isVector())
47891 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47892 if (auto *SplatC = RawC->getSplatValue())
47893 C = &(SplatC->getUniqueInteger());
47894
47895 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47896 return SDValue();
47897 } else {
47898 C = &(CNode->getAPIntValue());
47899 }
47900
47901 if (isPowerOf2_64(C->getZExtValue()))
47902 return SDValue();
47903
47904 int64_t SignMulAmt = C->getSExtValue();
47905 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47906 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47907
47908 SDValue NewMul = SDValue();
47909 if (VT == MVT::i64 || VT == MVT::i32) {
47910 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47911 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47912 DAG.getConstant(AbsMulAmt, DL, VT));
47913 if (SignMulAmt < 0)
47914 NewMul = DAG.getNegative(NewMul, DL, VT);
47915
47916 return NewMul;
47917 }
47918
47919 uint64_t MulAmt1 = 0;
47920 uint64_t MulAmt2 = 0;
47921 if ((AbsMulAmt % 9) == 0) {
47922 MulAmt1 = 9;
47923 MulAmt2 = AbsMulAmt / 9;
47924 } else if ((AbsMulAmt % 5) == 0) {
47925 MulAmt1 = 5;
47926 MulAmt2 = AbsMulAmt / 5;
47927 } else if ((AbsMulAmt % 3) == 0) {
47928 MulAmt1 = 3;
47929 MulAmt2 = AbsMulAmt / 3;
47930 }
47931
47932 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47933 if (MulAmt2 &&
47934 (isPowerOf2_64(MulAmt2) ||
47935 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47936
47937 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47938 N->use_begin()->getOpcode() == ISD::ADD))
47939 // If the second multiplier is a power of 2, issue it first. We want the multiply
47940 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47941 // use is an add. Only do this for positive multiply amounts since the
47942 // negate would prevent it from being used as an address mode anyway.
47943 std::swap(MulAmt1, MulAmt2);
47944
47945 if (isPowerOf2_64(MulAmt1))
47946 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47947 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47948 else
47949 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47950 DAG.getConstant(MulAmt1, DL, VT));
47951
47952 if (isPowerOf2_64(MulAmt2))
47953 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47954 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47955 else
47956 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47957 DAG.getConstant(MulAmt2, DL, VT));
47958
47959 // Negate the result.
47960 if (SignMulAmt < 0)
47961 NewMul = DAG.getNegative(NewMul, DL, VT);
47962 } else if (!Subtarget.slowLEA())
47963 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47964 }
47965 if (!NewMul) {
47966 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47967 assert(C->getZExtValue() != 0 &&
47968 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47969 "Both cases that could cause potential overflows should have "
47970 "already been handled.");
47971 if (isPowerOf2_64(AbsMulAmt - 1)) {
47972 // (mul x, 2^N + 1) => (add (shl x, N), x)
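// e.g. (mul x, 17) -> (add (shl x, 4), x).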
47973 NewMul = DAG.getNode(
47974 ISD::ADD, DL, VT, N->getOperand(0),
47975 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47976 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47977 if (SignMulAmt < 0)
47978 NewMul = DAG.getNegative(NewMul, DL, VT);
47979 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47980 // (mul x, 2^N - 1) => (sub (shl x, N), x)
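// e.g. (mul x, 7) -> (sub (shl x, 3), x); for -7 the subtract operands swap.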
47981 NewMul =
47982 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47983 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47984 // To negate, reverse the operands of the subtract.
47985 if (SignMulAmt < 0)
47986 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47987 else
47988 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47989 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47990 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47991 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
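// e.g. (mul x, 10) -> (add (shl x, 3), (add x, x)).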
47992 NewMul =
47993 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47994 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47995 NewMul = DAG.getNode(
47996 ISD::ADD, DL, VT, NewMul,
47997 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47998 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47999 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
48000 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
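// e.g. (mul x, 14) -> (sub (shl x, 4), (add x, x)).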
48001 NewMul =
48002 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48003 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
48004 NewMul = DAG.getNode(
48005 ISD::SUB, DL, VT, NewMul,
48006 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48007 } else if (SignMulAmt >= 0 && VT.isVector() &&
48008 Subtarget.fastImmVectorShift()) {
48009 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
48010 uint64_t ShiftAmt1;
48011 std::optional<unsigned> Opc;
48012 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
48013 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
48014 Opc = ISD::ADD;
48015 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
48016 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
48017 Opc = ISD::SUB;
48018 }
48019
48020 if (Opc) {
48021 SDValue Shift1 =
48022 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48023 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
48024 SDValue Shift2 =
48025 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48026 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
48027 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
48028 }
48029 }
48030 }
48031
48032 return NewMul;
48033}
48034
48035// Try to form a MULHU or MULHS node by looking for
48036// (srl (mul ext, ext), 16)
48037// TODO: This is X86 specific because we want to be able to handle wide types
48038// before type legalization. But we can only do it if the vector will be
48039// legalized via widening/splitting. Type legalization can't handle promotion
48040// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48041// combiner.
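// e.g. (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)), 16)
// becomes (zext (mulhu X, Y) to v8i32).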
48042static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48043 const SDLoc &DL,
48044 const X86Subtarget &Subtarget) {
48045 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48046 "SRL or SRA node is required here!");
48047
48048 if (!Subtarget.hasSSE2())
48049 return SDValue();
48050
48051 // The operation feeding into the shift must be a multiply.
48052 SDValue ShiftOperand = N->getOperand(0);
48053 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48054 return SDValue();
48055
48056 // Input type should be at least vXi32.
48057 EVT VT = N->getValueType(0);
48058 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48059 return SDValue();
48060
48061 // Need a shift by 16.
48062 APInt ShiftAmt;
48063 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48064 ShiftAmt != 16)
48065 return SDValue();
48066
48067 SDValue LHS = ShiftOperand.getOperand(0);
48068 SDValue RHS = ShiftOperand.getOperand(1);
48069
48070 unsigned ExtOpc = LHS.getOpcode();
48071 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48072 RHS.getOpcode() != ExtOpc)
48073 return SDValue();
48074
48075 // Peek through the extends.
48076 LHS = LHS.getOperand(0);
48077 RHS = RHS.getOperand(0);
48078
48079 // Ensure the input types match.
48080 EVT MulVT = LHS.getValueType();
48081 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48082 return SDValue();
48083
48084 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48085 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48086
48087 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48088 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48089}
48090
48091static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
48092 const X86Subtarget &Subtarget) {
48093 using namespace llvm::SDPatternMatch;
48094 SDValue N0 = N->getOperand(0);
48095 SDValue N1 = N->getOperand(1);
48096 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48097 EVT VT = N0.getValueType();
48098 unsigned EltSizeInBits = VT.getScalarSizeInBits();
48099 SDLoc DL(N);
48100
48101 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48102 // with out-of-bounds clamping.
48103 if (N0.getOpcode() == ISD::VSELECT &&
48104 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
48105 SDValue Cond = N0.getOperand(0);
48106 SDValue N00 = N0.getOperand(1);
48107 SDValue N01 = N0.getOperand(2);
48108 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
48109 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
48110 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
48111 m_SpecificCondCode(ISD::SETULT)))) {
48112 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
48113 }
48114 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
48115 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
48116 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
48117 m_SpecificCondCode(ISD::SETUGE)))) {
48118 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
48119 }
48120 }
48121
48122 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48123 // since the result of setcc_c is all zero's or all ones.
48124 if (VT.isInteger() && !VT.isVector() &&
48125 N1C && N0.getOpcode() == ISD::AND &&
48126 N0.getOperand(1).getOpcode() == ISD::Constant) {
48127 SDValue N00 = N0.getOperand(0);
48128 APInt Mask = N0.getConstantOperandAPInt(1);
48129 Mask <<= N1C->getAPIntValue();
48130 bool MaskOK = false;
48131 // We can handle cases concerning bit-widening nodes containing setcc_c if
48132 // we carefully interrogate the mask to make sure the transform is
48133 // semantics preserving.
48134 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48135 // of the underlying setcc_c operation if the setcc_c was zero extended.
48136 // Consider the following example:
48137 // zext(setcc_c) -> i32 0x0000FFFF
48138 // c1 -> i32 0x0000FFFF
48139 // c2 -> i32 0x00000001
48140 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48141 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48142 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48143 MaskOK = true;
48144 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48145 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48146 MaskOK = true;
48147 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48148 N00.getOpcode() == ISD::ANY_EXTEND) &&
48149 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48150 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48151 }
48152 if (MaskOK && Mask != 0)
48153 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48154 }
48155
48156 return SDValue();
48157}
48158
48159static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48160 const X86Subtarget &Subtarget) {
48161 using namespace llvm::SDPatternMatch;
48162 SDValue N0 = N->getOperand(0);
48163 SDValue N1 = N->getOperand(1);
48164 EVT VT = N0.getValueType();
48165 unsigned Size = VT.getSizeInBits();
48166 SDLoc DL(N);
48167
48168 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
48169 return V;
48170
48171 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
48172 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
48173 SDValue ShrAmtVal;
48174 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
48175 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
48176 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
48177 }
48178
48179 // fold (SRA (SHL X, ShlConst), SraConst)
48180 // into (SHL (sext_in_reg X), ShlConst - SraConst)
48181 // or (sext_in_reg X)
48182 // or (SRA (sext_in_reg X), SraConst - ShlConst)
48183 // depending on relation between SraConst and ShlConst.
48184 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
48185 // us to do the sext_in_reg from the corresponding bit.
48186
48187 // sexts in X86 are MOVs. The MOVs have the same code size
48188 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
48189 // However, the MOVs have two advantages over a SHIFT:
48190 // 1. MOVs can write to a register that differs from the source.
48191 // 2. MOVs accept memory operands.
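// e.g. for i32: (sra (shl X, 24), 24) -> (sext_in_reg X, i8), and
// (sra (shl X, 24), 26) -> (sra (sext_in_reg X, i8), 2).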
48192
48193 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48194 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48195 N0.getOperand(1).getOpcode() != ISD::Constant)
48196 return SDValue();
48197
48198 SDValue N00 = N0.getOperand(0);
48199 SDValue N01 = N0.getOperand(1);
48200 APInt ShlConst = N01->getAsAPIntVal();
48201 APInt SraConst = N1->getAsAPIntVal();
48202 EVT CVT = N1.getValueType();
48203
48204 if (CVT != N01.getValueType())
48205 return SDValue();
48206 if (SraConst.isNegative())
48207 return SDValue();
48208
48209 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48210 unsigned ShiftSize = SVT.getSizeInBits();
48211 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
48212 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48213 continue;
48214 SDValue NN =
48215 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48216 if (SraConst.eq(ShlConst))
48217 return NN;
48218 if (SraConst.ult(ShlConst))
48219 return DAG.getNode(ISD::SHL, DL, VT, NN,
48220 DAG.getConstant(ShlConst - SraConst, DL, CVT));
48221 return DAG.getNode(ISD::SRA, DL, VT, NN,
48222 DAG.getConstant(SraConst - ShlConst, DL, CVT));
48223 }
48224 return SDValue();
48225}
48226
48227static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48228 TargetLowering::DAGCombinerInfo &DCI,
48229 const X86Subtarget &Subtarget) {
48230 using namespace llvm::SDPatternMatch;
48231 SDValue N0 = N->getOperand(0);
48232 SDValue N1 = N->getOperand(1);
48233 EVT VT = N0.getValueType();
48234 unsigned EltSizeInBits = VT.getScalarSizeInBits();
48235 SDLoc DL(N);
48236
48237 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
48238 return V;
48239
48240 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48241 // with out-of-bounds clamping.
48242 if (N0.getOpcode() == ISD::VSELECT &&
48243 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
48244 SDValue Cond = N0.getOperand(0);
48245 SDValue N00 = N0.getOperand(1);
48246 SDValue N01 = N0.getOperand(2);
48247 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
48248 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
48249 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
48250 m_SpecificCondCode(ISD::SETULT)))) {
48251 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
48252 }
48253 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
48254 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
48255 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
48256 m_SpecificCondCode(ISD::SETUGE)))) {
48257 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
48258 }
48259 }
48260
48261 // Only do this on the last DAG combine as it can interfere with other
48262 // combines.
48263 if (!DCI.isAfterLegalizeDAG())
48264 return SDValue();
48265
48266 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48267 // TODO: This is a generic DAG combine that became an x86-only combine to
48268 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48269 // and-not ('andn').
48270 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48271 return SDValue();
48272
48273 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48274 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48275 if (!ShiftC || !AndC)
48276 return SDValue();
48277
48278 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48279 // transform should reduce code size. It may also enable secondary transforms
48280 // from improved known-bits analysis or instruction selection.
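// e.g. for i32: (srl (and X, 0x7F00), 8) -> (and (srl X, 8), 0x7F), where the
// shifted mask now fits in an 8-bit immediate.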
48281 APInt MaskVal = AndC->getAPIntValue();
48282
48283 // If this can be matched by a zero extend, don't optimize.
48284 if (MaskVal.isMask()) {
48285 unsigned TO = MaskVal.countr_one();
48286 if (TO >= 8 && isPowerOf2_32(TO))
48287 return SDValue();
48288 }
48289
48290 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48291 unsigned OldMaskSize = MaskVal.getSignificantBits();
48292 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48293 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48294 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48295 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48296 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48297 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48298 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48299 }
48300 return SDValue();
48301}
48302
48303static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48304 const X86Subtarget &Subtarget) {
48305 unsigned Opcode = N->getOpcode();
48306 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48307
48308 SDLoc DL(N);
48309 EVT VT = N->getValueType(0);
48310 SDValue N0 = N->getOperand(0);
48311 SDValue N1 = N->getOperand(1);
48312 EVT SrcVT = N0.getValueType();
48313
48314 SDValue BC0 =
48315 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48316 SDValue BC1 =
48317 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48318
48319 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48320 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48321 // truncation trees that help us avoid lane crossing shuffles.
48322 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48323 // TODO: We don't handle vXf64 shuffles yet.
48324 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48325 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48326 SmallVector<SDValue> ShuffleOps;
48327 SmallVector<int> ShuffleMask, ScaledMask;
48328 SDValue Vec = peekThroughBitcasts(BCSrc);
48329 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48331 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48332 // shuffle to a v4X64 width - we can probably relax this in the future.
48333 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48334 ShuffleOps[0].getValueType().is256BitVector() &&
48335 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48336 SDValue Lo, Hi;
48337 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48338 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48339 Lo = DAG.getBitcast(SrcVT, Lo);
48340 Hi = DAG.getBitcast(SrcVT, Hi);
48341 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48342 Res = DAG.getBitcast(ShufVT, Res);
48343 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48344 return DAG.getBitcast(VT, Res);
48345 }
48346 }
48347 }
48348 }
48349
48350 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48351 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48352 // If either/both ops are a shuffle that can scale to v2x64,
48353 // then see if we can perform this as a v4x32 post shuffle.
48354 SmallVector<SDValue> Ops0, Ops1;
48355 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48356 bool IsShuf0 =
48357 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48358 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48359 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48360 bool IsShuf1 =
48361 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48362 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48363 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48364 if (IsShuf0 || IsShuf1) {
48365 if (!IsShuf0) {
48366 Ops0.assign({BC0});
48367 ScaledMask0.assign({0, 1});
48368 }
48369 if (!IsShuf1) {
48370 Ops1.assign({BC1});
48371 ScaledMask1.assign({0, 1});
48372 }
48373
48374 SDValue LHS, RHS;
48375 int PostShuffle[4] = {-1, -1, -1, -1};
48376 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48377 if (M < 0)
48378 return true;
48379 Idx = M % 2;
48380 SDValue Src = Ops[M / 2];
48381 if (!LHS || LHS == Src) {
48382 LHS = Src;
48383 return true;
48384 }
48385 if (!RHS || RHS == Src) {
48386 Idx += 2;
48387 RHS = Src;
48388 return true;
48389 }
48390 return false;
48391 };
48392 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48393 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48394 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48395 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48396 LHS = DAG.getBitcast(SrcVT, LHS);
48397 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48398 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48399 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48400 Res = DAG.getBitcast(ShufVT, Res);
48401 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48402 return DAG.getBitcast(VT, Res);
48403 }
48404 }
48405 }
48406
48407 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48408 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48409 SmallVector<int> Mask0, Mask1;
48410 SmallVector<SDValue> Ops0, Ops1;
48411 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48412 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48413 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48414 !Ops0.empty() && !Ops1.empty() &&
48415 all_of(Ops0,
48416 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48417 all_of(Ops1,
48418 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48419 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48420 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48421 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48422 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48423 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48424 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48425 if ((Op00 == Op11) && (Op01 == Op10)) {
48426 std::swap(Op10, Op11);
48428 }
48429 if ((Op00 == Op10) && (Op01 == Op11)) {
48430 const int Map[4] = {0, 2, 1, 3};
48431 SmallVector<int, 4> ShuffleMask(
48432 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48433 Map[ScaledMask1[1]]});
48434 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48435 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48436 DAG.getBitcast(SrcVT, Op01));
48437 Res = DAG.getBitcast(ShufVT, Res);
48438 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48439 return DAG.getBitcast(VT, Res);
48440 }
48441 }
48442 }
48443
48444 return SDValue();
48445}
48446
48447static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48448 TargetLowering::DAGCombinerInfo &DCI,
48449 const X86Subtarget &Subtarget) {
48450 unsigned Opcode = N->getOpcode();
48451 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48452 "Unexpected pack opcode");
48453
48454 EVT VT = N->getValueType(0);
48455 SDValue N0 = N->getOperand(0);
48456 SDValue N1 = N->getOperand(1);
48457 unsigned NumDstElts = VT.getVectorNumElements();
48458 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48459 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48460 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48461 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48462 "Unexpected PACKSS/PACKUS input type");
48463
48464 bool IsSigned = (X86ISD::PACKSS == Opcode);
48465
48466 // Constant Folding.
48467 APInt UndefElts0, UndefElts1;
48468 SmallVector<APInt, 32> EltBits0, EltBits1;
48469 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48470 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48471 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
48472 /*AllowWholeUndefs*/ true,
48473 /*AllowPartialUndefs*/ true) &&
48474 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
48475 /*AllowWholeUndefs*/ true,
48476 /*AllowPartialUndefs*/ true)) {
48477 unsigned NumLanes = VT.getSizeInBits() / 128;
48478 unsigned NumSrcElts = NumDstElts / 2;
48479 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48480 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48481
48482 APInt Undefs(NumDstElts, 0);
48483 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48484 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48485 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48486 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48487 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48488 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48489
48490 if (UndefElts[SrcIdx]) {
48491 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48492 continue;
48493 }
48494
48495 APInt &Val = EltBits[SrcIdx];
48496 if (IsSigned) {
48497 // PACKSS: Truncate signed value with signed saturation.
48498 // Source values less than dst minint are saturated to minint.
48499 // Source values greater than dst maxint are saturated to maxint.
48500 Val = Val.truncSSat(DstBitsPerElt);
48501 } else {
48502 // PACKUS: Truncate signed value with unsigned saturation.
48503 // Source values less than zero are saturated to zero.
48504 // Source values greater than dst maxuint are saturated to maxuint.
48505 // NOTE: This is different from APInt::truncUSat.
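// e.g. when packing i16 to i8: 300 saturates to 255, -5 saturates to 0, and
// 77 truncates to 77.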
48506 if (Val.isIntN(DstBitsPerElt))
48507 Val = Val.trunc(DstBitsPerElt);
48508 else if (Val.isNegative())
48509 Val = APInt::getZero(DstBitsPerElt);
48510 else
48511 Val = APInt::getAllOnes(DstBitsPerElt);
48512 }
48513 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48514 }
48515 }
48516
48517 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48518 }
48519
48520 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48521 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48522 return V;
48523
48524 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
48525 // Currently limit this to allsignbits cases only.
48526 if (IsSigned &&
48527 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
48528 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
48529 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
48530 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
48531 if (Not0 && Not1) {
48532 SDLoc DL(N);
48533 MVT SrcVT = N0.getSimpleValueType();
48534 SDValue Pack =
48535 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
48536 DAG.getBitcast(SrcVT, Not1));
48537 return DAG.getNOT(DL, Pack, VT);
48538 }
48539 }
48540
48541 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48542 // truncate to create a larger truncate.
48543 if (Subtarget.hasAVX512() &&
48544 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48545 N0.getOperand(0).getValueType() == MVT::v8i32) {
48546 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48547 (!IsSigned &&
48548 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48549 if (Subtarget.hasVLX())
48550 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48551
48552 // Widen input to v16i32 so we can truncate that.
48553 SDLoc dl(N);
48554 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48555 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48556 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48557 }
48558 }
48559
48560 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48561 if (VT.is128BitVector()) {
48562 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48563 SDValue Src0, Src1;
48564 if (N0.getOpcode() == ExtOpc &&
48566 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48567 Src0 = N0.getOperand(0);
48568 }
48569 if (N1.getOpcode() == ExtOpc &&
48571 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48572 Src1 = N1.getOperand(0);
48573 }
48574 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48575 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48576 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48577 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48578 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48579 }
48580
48581 // Try again with pack(*_extend_vector_inreg, undef).
48582 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48583 : ISD::ZERO_EXTEND_VECTOR_INREG;
48584 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48585 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48586 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48587 DAG);
48588 }
48589
48590 // Attempt to combine as shuffle.
48591 SDValue Op(N, 0);
48592 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48593 return Res;
48594
48595 return SDValue();
48596}
48597
48598static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48599 TargetLowering::DAGCombinerInfo &DCI,
48600 const X86Subtarget &Subtarget) {
48601 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48602 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48603 "Unexpected horizontal add/sub opcode");
48604
48605 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48606 MVT VT = N->getSimpleValueType(0);
48607 SDValue LHS = N->getOperand(0);
48608 SDValue RHS = N->getOperand(1);
48609
48610 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
48611 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48612 LHS.getOpcode() == RHS.getOpcode() &&
48613 LHS.getValueType() == RHS.getValueType() &&
48614 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48615 SDValue LHS0 = LHS.getOperand(0);
48616 SDValue LHS1 = LHS.getOperand(1);
48617 SDValue RHS0 = RHS.getOperand(0);
48618 SDValue RHS1 = RHS.getOperand(1);
48619 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48620 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48621 SDLoc DL(N);
48622 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48623 LHS0.isUndef() ? LHS1 : LHS0,
48624 RHS0.isUndef() ? RHS1 : RHS0);
48625 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48626 Res = DAG.getBitcast(ShufVT, Res);
48627 SDValue NewLHS =
48628 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48629 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48630 SDValue NewRHS =
48631 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48632 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48633 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48634 DAG.getBitcast(VT, NewRHS));
48635 }
48636 }
48637 }
48638
48639 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48640 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48641 return V;
48642
48643 return SDValue();
48644}
48645
48646static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48647 TargetLowering::DAGCombinerInfo &DCI,
48648 const X86Subtarget &Subtarget) {
48649 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48650 X86ISD::VSRL == N->getOpcode()) &&
48651 "Unexpected shift opcode");
48652 EVT VT = N->getValueType(0);
48653 SDValue N0 = N->getOperand(0);
48654 SDValue N1 = N->getOperand(1);
48655
48656 // Shift zero -> zero.
48657 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48658 return DAG.getConstant(0, SDLoc(N), VT);
48659
48660 // Detect constant shift amounts.
48661 APInt UndefElts;
48662 SmallVector<APInt, 32> EltBits;
48663 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
48664 /*AllowWholeUndefs*/ true,
48665 /*AllowPartialUndefs*/ false)) {
48666 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48667 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48668 EltBits[0].getZExtValue(), DAG);
48669 }
48670
48671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48672 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48673 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48674 return SDValue(N, 0);
48675
48676 return SDValue();
48677}
48678
48679static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48680 TargetLowering::DAGCombinerInfo &DCI,
48681 const X86Subtarget &Subtarget) {
48682 unsigned Opcode = N->getOpcode();
48683 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48684 X86ISD::VSRLI == Opcode) &&
48685 "Unexpected shift opcode");
48686 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48687 EVT VT = N->getValueType(0);
48688 SDValue N0 = N->getOperand(0);
48689 SDValue N1 = N->getOperand(1);
48690 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48691 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48692 "Unexpected value type");
48693 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
48694
48695 // (shift undef, X) -> 0
48696 if (N0.isUndef())
48697 return DAG.getConstant(0, SDLoc(N), VT);
48698
48699 // Out of range logical bit shifts are guaranteed to be zero.
48700 // Out of range arithmetic bit shifts splat the sign bit.
48701 unsigned ShiftVal = N->getConstantOperandVal(1);
48702 if (ShiftVal >= NumBitsPerElt) {
48703 if (LogicalShift)
48704 return DAG.getConstant(0, SDLoc(N), VT);
48705 ShiftVal = NumBitsPerElt - 1;
48706 }
48707
48708 // (shift X, 0) -> X
48709 if (!ShiftVal)
48710 return N0;
48711
48712 // (shift 0, C) -> 0
48713 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48714 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48715 // result are all zeros, not undef.
48716 return DAG.getConstant(0, SDLoc(N), VT);
48717
48718 // (VSRAI -1, C) -> -1
48719 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48720 // N0 is all ones or undef. We guarantee that the bits shifted into the
48721 // result are all ones, not undef.
48722 return DAG.getConstant(-1, SDLoc(N), VT);
48723
48724 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48725 unsigned NewShiftVal = Amt0 + Amt1;
48726 if (NewShiftVal >= NumBitsPerElt) {
48727 // Out of range logical bit shifts are guaranteed to be zero.
48728 // Out of range arithmetic bit shifts splat the sign bit.
48729 if (LogicalShift)
48730 return DAG.getConstant(0, SDLoc(N), VT);
48731 NewShiftVal = NumBitsPerElt - 1;
48732 }
48733 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48734 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48735 };
48736
48737 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48738 if (Opcode == N0.getOpcode())
48739 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48740
48741 // (shl (add X, X), C) -> (shl X, (C + 1))
48742 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48743 N0.getOperand(0) == N0.getOperand(1))
48744 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48745
48746 // We can decode 'whole byte' logical bit shifts as shuffles.
48747 if (LogicalShift && (ShiftVal % 8) == 0) {
48748 SDValue Op(N, 0);
48749 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48750 return Res;
48751 }
48752
48753 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48754 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48755 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48756 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48757 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48758 N0.getOpcode() == X86ISD::PSHUFD &&
48759 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48760 N0->hasOneUse()) {
48761 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48762 if (BC.getOpcode() == X86ISD::VSHLI &&
48763 BC.getScalarValueSizeInBits() == 64 &&
48764 BC.getConstantOperandVal(1) == 63) {
48765 SDLoc DL(N);
48766 SDValue Src = BC.getOperand(0);
48767 Src = DAG.getBitcast(VT, Src);
48768 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48769 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48770 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48771 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48772 return Src;
48773 }
48774 }
48775
48776 auto TryConstantFold = [&](SDValue V) {
48777 APInt UndefElts;
48778 SmallVector<APInt, 32> EltBits;
48779 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48780 /*AllowWholeUndefs*/ true,
48781 /*AllowPartialUndefs*/ true))
48782 return SDValue();
48783 assert(EltBits.size() == VT.getVectorNumElements() &&
48784 "Unexpected shift value type");
48785 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48786 // created an undef input due to no input bits being demanded, but user
48787 // still expects 0 in other bits.
48788 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48789 APInt &Elt = EltBits[i];
48790 if (UndefElts[i])
48791 Elt = 0;
48792 else if (X86ISD::VSHLI == Opcode)
48793 Elt <<= ShiftVal;
48794 else if (X86ISD::VSRAI == Opcode)
48795 Elt.ashrInPlace(ShiftVal);
48796 else
48797 Elt.lshrInPlace(ShiftVal);
48798 }
48799 // Reset undef elements since they were zeroed above.
48800 UndefElts = 0;
48801 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48802 };
48803
48804 // Constant Folding.
48805 if (N->isOnlyUserOf(N0.getNode())) {
48806 if (SDValue C = TryConstantFold(N0))
48807 return C;
48808
48809 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48810 // Don't break NOT patterns.
48811 SDValue BC = peekThroughOneUseBitcasts(N0);
48812 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48813 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48814 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48815 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48816 SDLoc DL(N);
48817 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48818 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48819 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48820 }
48821 }
48822 }
48823
48824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48825 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48826 DCI))
48827 return SDValue(N, 0);
48828
48829 return SDValue();
48830}
48831
48832static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48833 TargetLowering::DAGCombinerInfo &DCI,
48834 const X86Subtarget &Subtarget) {
48835 EVT VT = N->getValueType(0);
48836 unsigned Opcode = N->getOpcode();
48837 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48838 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48839 Opcode == ISD::INSERT_VECTOR_ELT) &&
48840 "Unexpected vector insertion");
48841
48842 SDValue Vec = N->getOperand(0);
48843 SDValue Scl = N->getOperand(1);
48844 SDValue Idx = N->getOperand(2);
48845
48846 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48847 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48848 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48849
48850 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48851 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48852 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48853 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48854 APInt::getAllOnes(NumBitsPerElt), DCI))
48855 return SDValue(N, 0);
48856 }
48857
48858 // Attempt to combine insertion patterns to a shuffle.
48859 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48860 SDValue Op(N, 0);
48861 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48862 return Res;
48863 }
48864
48865 return SDValue();
48866}
48867
48868/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48869/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48870/// OR -> CMPNEQSS.
48871static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48872 TargetLowering::DAGCombinerInfo &DCI,
48873 const X86Subtarget &Subtarget) {
48874 unsigned opcode;
48875
48876 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48877 // we're requiring SSE2 for both.
48878 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48879 SDValue N0 = N->getOperand(0);
48880 SDValue N1 = N->getOperand(1);
48881 SDValue CMP0 = N0.getOperand(1);
48882 SDValue CMP1 = N1.getOperand(1);
48883 SDLoc DL(N);
48884
48885 // The SETCCs should both refer to the same CMP.
48886 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48887 return SDValue();
48888
48889 SDValue CMP00 = CMP0->getOperand(0);
48890 SDValue CMP01 = CMP0->getOperand(1);
48891 EVT VT = CMP00.getValueType();
48892
48893 if (VT == MVT::f32 || VT == MVT::f64 ||
48894 (VT == MVT::f16 && Subtarget.hasFP16())) {
48895 bool ExpectingFlags = false;
48896 // Check for any users that want flags:
48897 for (const SDNode *U : N->uses()) {
48898 if (ExpectingFlags)
48899 break;
48900
48901 switch (U->getOpcode()) {
48902 default:
48903 case ISD::BR_CC:
48904 case ISD::BRCOND:
48905 case ISD::SELECT:
48906 ExpectingFlags = true;
48907 break;
48908 case ISD::CopyToReg:
48909 case ISD::SIGN_EXTEND:
48910 case ISD::ZERO_EXTEND:
48911 case ISD::ANY_EXTEND:
48912 break;
48913 }
48914 }
48915
48916 if (!ExpectingFlags) {
48917 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48918 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48919
48920 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48921 X86::CondCode tmp = cc0;
48922 cc0 = cc1;
48923 cc1 = tmp;
48924 }
48925
48926 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48927 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48928 // FIXME: need symbolic constants for these magic numbers.
48929 // See X86ATTInstPrinter.cpp:printSSECC().
48930 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48931 if (Subtarget.hasAVX512()) {
48932 SDValue FSetCC =
48933 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48934 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48935 // Need to fill with zeros to ensure the bitcast will produce zeroes
48936 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48937 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48938 DAG.getConstant(0, DL, MVT::v16i1),
48939 FSetCC, DAG.getIntPtrConstant(0, DL));
48940 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48941 N->getSimpleValueType(0));
48942 }
48943 SDValue OnesOrZeroesF =
48944 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48945 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48946
48947 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48948 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48949
48950 if (is64BitFP && !Subtarget.is64Bit()) {
48951 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48952 // 64-bit integer, since that's not a legal type. Since
48953 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48954 // bits, but can do this little dance to extract the lowest 32 bits
48955 // and work with those going forward.
48956 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48957 OnesOrZeroesF);
48958 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48959 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48960 Vector32, DAG.getIntPtrConstant(0, DL));
48961 IntVT = MVT::i32;
48962 }
48963
48964 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48965 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48966 DAG.getConstant(1, DL, IntVT));
48967 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48968 ANDed);
48969 return OneBitOfTruth;
48970 }
48971 }
48972 }
48973 }
48974 return SDValue();
48975}
48976
48977/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48978static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48979 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48980
48981 MVT VT = N->getSimpleValueType(0);
48982 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48983 return SDValue();
48984
48985 SDValue X, Y;
48986 SDValue N0 = N->getOperand(0);
48987 SDValue N1 = N->getOperand(1);
48988
48989 if (SDValue Not = IsNOT(N0, DAG)) {
48990 X = Not;
48991 Y = N1;
48992 } else if (SDValue Not = IsNOT(N1, DAG)) {
48993 X = Not;
48994 Y = N0;
48995 } else
48996 return SDValue();
48997
48998 X = DAG.getBitcast(VT, X);
48999 Y = DAG.getBitcast(VT, Y);
49000 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49001}
49002
49003/// Try to fold:
49004/// and (vector_shuffle<Z,...,Z>
49005/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49006/// ->
49007/// andnp (vector_shuffle<Z,...,Z>
49008/// (insert_vector_elt undef, X, Z), undef), Y
49009static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49010 const X86Subtarget &Subtarget) {
49011 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49012
49013 EVT VT = N->getValueType(0);
49014 // Do not split 256- and 512-bit vectors with SSE2, as doing so overwrites
49015 // the original value and requires extra moves.
49016 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49017 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49018 return SDValue();
49019
49020 auto GetNot = [&DAG](SDValue V) {
49021 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49022 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49023 // end-users are ISD::AND including cases
49024 // (and(extract_vector_element(SVN), Y)).
49025 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49026 !SVN->getOperand(1).isUndef()) {
49027 return SDValue();
49028 }
49029 SDValue IVEN = SVN->getOperand(0);
49030 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49031 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49032 return SDValue();
49033 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49034 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49035 return SDValue();
49036 SDValue Src = IVEN.getOperand(1);
49037 if (SDValue Not = IsNOT(Src, DAG)) {
49038 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49039 SDValue NotIVEN =
49040 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49041 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49042 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49043 SVN->getOperand(1), SVN->getMask());
49044 }
49045 return SDValue();
49046 };
49047
49048 SDValue X, Y;
49049 SDValue N0 = N->getOperand(0);
49050 SDValue N1 = N->getOperand(1);
49051 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49052
49053 if (SDValue Not = GetNot(N0)) {
49054 X = Not;
49055 Y = N1;
49056 } else if (SDValue Not = GetNot(N1)) {
49057 X = Not;
49058 Y = N0;
49059 } else
49060 return SDValue();
49061
49062 X = DAG.getBitcast(VT, X);
49063 Y = DAG.getBitcast(VT, Y);
49064 SDLoc DL(N);
49065
49066 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49067 // AVX2.
49068 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
49069 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
49070 SDValue LoX, HiX;
49071 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49072 SDValue LoY, HiY;
49073 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49074 EVT SplitVT = LoX.getValueType();
49075 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49076 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49077 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49078 }
49079
49080 if (TLI.isTypeLegal(VT))
49081 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49082
49083 return SDValue();
49084}
49085
49086// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49087// logical operations, like in the example below.
49088// or (and (truncate x, truncate y)),
49089// (xor (truncate z, build_vector (constants)))
49090// Given a target type \p VT, we generate
49091// or (and x, y), (xor z, zext(build_vector (constants)))
49092// given x, y and z are of type \p VT. We can do so, if operands are either
49093// truncates from VT types, the second operand is a vector of constants or can
49094// be recursively promoted.
49095static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
49096 SelectionDAG &DAG, unsigned Depth) {
49097 // Limit recursion to avoid excessive compile times.
49098 if (Depth >= SelectionDAG::MaxRecursionDepth)
49099 return SDValue();
49100
49101 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
49102 return SDValue();
49103
49104 SDValue N0 = N.getOperand(0);
49105 SDValue N1 = N.getOperand(1);
49106
49107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49108 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
49109 return SDValue();
49110
49111 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
49112 N0 = NN0;
49113 else {
49114 // The left side has to be a trunc.
49115 if (N0.getOpcode() != ISD::TRUNCATE)
49116 return SDValue();
49117
49118 // The type of the truncated inputs.
49119 if (N0.getOperand(0).getValueType() != VT)
49120 return SDValue();
49121
49122 N0 = N0.getOperand(0);
49123 }
49124
49125 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
49126 N1 = NN1;
49127 else {
49128 // The right side has to be a 'trunc' or a (foldable) constant.
49129 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49130 N1.getOperand(0).getValueType() == VT;
49131 if (RHSTrunc)
49132 N1 = N1.getOperand(0);
49133 else if (SDValue Cst =
49134 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
49135 N1 = Cst;
49136 else
49137 return SDValue();
49138 }
49139
49140 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
49141}
49142
49143// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
49144// register. In most cases we actually compare or select YMM-sized registers
49145// and mixing the two types creates horrible code. This method optimizes
49146// some of the transition sequences.
49147// Even with AVX-512 this is still useful for removing casts around logical
49148// operations on vXi1 mask types.
49149static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
49150 SelectionDAG &DAG,
49151 const X86Subtarget &Subtarget) {
49152 EVT VT = N.getValueType();
49153 assert(VT.isVector() && "Expected vector type");
49154 assert((N.getOpcode() == ISD::ANY_EXTEND ||
49155 N.getOpcode() == ISD::ZERO_EXTEND ||
49156 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49157
49158 SDValue Narrow = N.getOperand(0);
49159 EVT NarrowVT = Narrow.getValueType();
49160
49161 // Generate the wide operation.
49162 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
49163 if (!Op)
49164 return SDValue();
49165 switch (N.getOpcode()) {
49166 default: llvm_unreachable("Unexpected opcode");
49167 case ISD::ANY_EXTEND:
49168 return Op;
49169 case ISD::ZERO_EXTEND:
49170 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49171 case ISD::SIGN_EXTEND:
49172 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49173 Op, DAG.getValueType(NarrowVT));
49174 }
49175}
49176
49177static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49178 unsigned FPOpcode;
49179 switch (Opcode) {
49180 // clang-format off
49181 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49182 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49183 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49184 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49185 // clang-format on
49186 }
49187 return FPOpcode;
49188}
49189
49190/// If both input operands of a logic op are being cast from floating-point
49191/// types or FP compares, try to convert this into a floating-point logic node
49192/// to avoid unnecessary moves from SSE to integer registers.
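// e.g. (and (bitcast f32 X to i32), (bitcast f32 Y to i32)) becomes
// (bitcast (fand X, Y) to i32), keeping the values in SSE registers.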
49193static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49194 TargetLowering::DAGCombinerInfo &DCI,
49195 const X86Subtarget &Subtarget) {
49196 EVT VT = N->getValueType(0);
49197 SDValue N0 = N->getOperand(0);
49198 SDValue N1 = N->getOperand(1);
49199 SDLoc DL(N);
49200
49201 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49202 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49203 return SDValue();
49204
49205 SDValue N00 = N0.getOperand(0);
49206 SDValue N10 = N1.getOperand(0);
49207 EVT N00Type = N00.getValueType();
49208 EVT N10Type = N10.getValueType();
49209
49210 // Ensure that both types are the same and are legal scalar fp types.
49211 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49212 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49213 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49214 return SDValue();
49215
49216 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49217 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49218 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49219 return DAG.getBitcast(VT, FPLogic);
49220 }
49221
49222 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49223 !N1.hasOneUse())
49224 return SDValue();
49225
49226 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49227 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49228
49229 // The vector ISA for FP predicates is incomplete before AVX, so converting
49230 // COMIS* to CMPS* may not be a win before AVX.
49231 if (!Subtarget.hasAVX() &&
49232 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49233 return SDValue();
49234
49235 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49236 // and vector logic:
49237 // logic (setcc N00, N01), (setcc N10, N11) -->
49238 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49239 unsigned NumElts = 128 / N00Type.getSizeInBits();
49240 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49241 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49242 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49243 SDValue N01 = N0.getOperand(1);
49244 SDValue N11 = N1.getOperand(1);
49245 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49246 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49247 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49248 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49249 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49250 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49251 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49252 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49253}
49254
49255// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49256// to reduce XMM->GPR traffic.
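// e.g. (and (movmsk V0), (movmsk V1)) -> (movmsk (and V0, V1)), using the
// equivalent FP logic opcode when the vectors are floating point.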
49257static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49258 unsigned Opc = N->getOpcode();
49259 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49260 "Unexpected bit opcode");
49261
49262 SDValue N0 = N->getOperand(0);
49263 SDValue N1 = N->getOperand(1);
49264
49265 // Both operands must be single use MOVMSK.
49266 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49267 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49268 return SDValue();
49269
49270 SDValue Vec0 = N0.getOperand(0);
49271 SDValue Vec1 = N1.getOperand(0);
49272 EVT VecVT0 = Vec0.getValueType();
49273 EVT VecVT1 = Vec1.getValueType();
49274
49275 // Both MOVMSK operands must be from vectors of the same size and same element
49276 // size, but it's OK for an fp/int difference.
49277 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49278 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49279 return SDValue();
49280
49281 SDLoc DL(N);
49282 unsigned VecOpc =
49283 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49284 SDValue Result =
49285 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49286 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49287}
49288
49289// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49290// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49291// handles in InstCombine.
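// e.g. (xor (vsrli X, 4), (vsrli Y, 4)) -> (vsrli (xor X, Y), 4).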
49292static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49293 unsigned Opc = N->getOpcode();
49294 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49295 "Unexpected bit opcode");
49296
49297 SDValue N0 = N->getOperand(0);
49298 SDValue N1 = N->getOperand(1);
49299 EVT VT = N->getValueType(0);
49300
49301 // Both operands must be single use.
49302 if (!N0.hasOneUse() || !N1.hasOneUse())
49303 return SDValue();
49304
49305 // Search for matching shifts.
49306 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49307 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49308
49309 unsigned BCOpc = BC0.getOpcode();
49310 EVT BCVT = BC0.getValueType();
49311 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49312 return SDValue();
49313
49314 switch (BCOpc) {
49315 case X86ISD::VSHLI:
49316 case X86ISD::VSRLI:
49317 case X86ISD::VSRAI: {
49318 if (BC0.getOperand(1) != BC1.getOperand(1))
49319 return SDValue();
49320
49321 SDLoc DL(N);
49322 SDValue BitOp =
49323 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49324 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49325 return DAG.getBitcast(VT, Shift);
49326 }
49327 }
49328
49329 return SDValue();
49330}
49331
49332// Attempt to fold:
49333// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
49334// TODO: Add PACKUS handling.
49335static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
49336 unsigned Opc = N->getOpcode();
49337 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49338 "Unexpected bit opcode");
49339
49340 SDValue N0 = N->getOperand(0);
49341 SDValue N1 = N->getOperand(1);
49342 EVT VT = N->getValueType(0);
49343
49344 // Both operands must be single use.
49345 if (!N0.hasOneUse() || !N1.hasOneUse())
49346 return SDValue();
49347
49348 // Search for matching packs.
49349 N0 = peekThroughOneUseBitcasts(N0);
49350 N1 = peekThroughOneUseBitcasts(N1);
49351
49352 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
49353 return SDValue();
49354
49355 MVT DstVT = N0.getSimpleValueType();
49356 if (DstVT != N1.getSimpleValueType())
49357 return SDValue();
49358
49359 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
49360 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
49361
49362 // Limit to allsignbits packing.
49363 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
49364 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
49365 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
49366 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
49367 return SDValue();
49368
49369 SDLoc DL(N);
49370 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
49371 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
49372 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
49373}
49374
49375/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49376/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49377/// with a shift-right to eliminate loading the vector constant mask value.
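// e.g. for v4i32: (and (pcmpeq A, B), (splat 1)) -> (vsrli (pcmpeq A, B), 31).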
49378static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49379 const X86Subtarget &Subtarget) {
49380 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49381 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49382 EVT VT = Op0.getValueType();
49383 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49384 return SDValue();
49385
49386 // Try to convert an "is positive" signbit masking operation into arithmetic
49387 // shift and "andn". This saves a materialization of a -1 vector constant.
49388 // The "is negative" variant should be handled more generally because it only
49389 // requires "and" rather than "andn":
49390 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49391 //
49392 // This is limited to the original type to avoid producing even more bitcasts.
49393 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49394 // will be profitable.
49395 if (N->getValueType(0) == VT &&
49396 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
49397 SDValue X, Y;
49398 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49399 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49400 X = Op1.getOperand(0);
49401 Y = Op0;
49402 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49403 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49404 X = Op0.getOperand(0);
49405 Y = Op1;
49406 }
49407 if (X && Y) {
49408 SDLoc DL(N);
49409 SDValue Sra =
49410 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49411 VT.getScalarSizeInBits() - 1, DAG);
49412 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49413 }
49414 }
49415
49416 APInt SplatVal;
49417 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
49418 return SDValue();
49419
49420 // Don't prevent creation of ANDN.
49421 if (isBitwiseNot(Op0))
49422 return SDValue();
49423
49424 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
49425 return SDValue();
49426
49427 unsigned EltBitWidth = VT.getScalarSizeInBits();
49428 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49429 return SDValue();
49430
49431 SDLoc DL(N);
49432 unsigned ShiftVal = SplatVal.countr_one();
49433 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49434 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49435 return DAG.getBitcast(N->getValueType(0), Shift);
49436}
49437
49438// Get the index node from the lowered DAG of a GEP IR instruction with one
49439// indexing dimension.
49440static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49441 if (Ld->isIndexed())
49442 return SDValue();
49443
49444 SDValue Base = Ld->getBasePtr();
49445
49446 if (Base.getOpcode() != ISD::ADD)
49447 return SDValue();
49448
49449 SDValue ShiftedIndex = Base.getOperand(0);
49450
49451 if (ShiftedIndex.getOpcode() != ISD::SHL)
49452 return SDValue();
49453
49454 return ShiftedIndex.getOperand(0);
49455
49456}
49457
49458static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49459 return Subtarget.hasBMI2() &&
49460 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
49461}
49462
49463// This function recognizes cases where the X86 BZHI instruction can replace an
49464// 'and-load' sequence.
49465// When an integer value is loaded from an array of constants which is defined
49466// as follows:
49467//
49468// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49469//
49470// and the result is then bitwise-ANDed with another input, this is equivalent
49471// to performing BZHI (zero high bits) on that input, using the same index as
49472// the load.
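// A rough illustration (hypothetical C source, 32-bit elements):
//   static const unsigned Masks[32] = {0x0, 0x1, 0x3, 0x7, /* ... 2^i - 1 */};
//   unsigned f(unsigned X, unsigned I) { return X & Masks[I]; }
// can be compiled to a single 'bzhi eax, edi, esi' instead of a load plus an
// 'and'.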
49473static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49474 const X86Subtarget &Subtarget) {
49475 MVT VT = Node->getSimpleValueType(0);
49476 SDLoc dl(Node);
49477
49478 // Check if subtarget has BZHI instruction for the node's type
49479 if (!hasBZHI(Subtarget, VT))
49480 return SDValue();
49481
49482 // Try matching the pattern for both operands.
49483 for (unsigned i = 0; i < 2; i++) {
49484 SDValue N = Node->getOperand(i);
49485 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49486
49487 // Bail out if the operand is not a load instruction.
49488 if (!Ld)
49489 return SDValue();
49490
49491 const Value *MemOp = Ld->getMemOperand()->getValue();
49492
49493 if (!MemOp)
49494 return SDValue();
49495
49496 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49497 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49498 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49499
49500 Constant *Init = GV->getInitializer();
49501 Type *Ty = Init->getType();
49502 if (!isa<ConstantDataArray>(Init) ||
49503 !Ty->getArrayElementType()->isIntegerTy() ||
49504 Ty->getArrayElementType()->getScalarSizeInBits() !=
49505 VT.getSizeInBits() ||
49506 Ty->getArrayNumElements() >
49507 Ty->getArrayElementType()->getScalarSizeInBits())
49508 continue;
49509
49510 // Check if the array's constant elements are suitable to our case.
49511 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49512 bool ConstantsMatch = true;
49513 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49514 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49515 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49516 ConstantsMatch = false;
49517 break;
49518 }
49519 }
49520 if (!ConstantsMatch)
49521 continue;
49522
49523 // Do the transformation (For 32-bit type):
49524 // -> (and (load arr[idx]), inp)
49525 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
49526 // that will be replaced with one bzhi instruction.
49527 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49528 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49529
49530 // Get the Node which indexes into the array.
49531 SDValue Index = getIndexFromUnindexedLoad(Ld);
49532 if (!Index)
49533 return SDValue();
49534 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49535
49536 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49537 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49538
49539 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49540 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49541
49542 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49543 }
49544 }
49545 }
49546 }
49547 return SDValue();
49548}
49549
49550// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49551// where C is a mask containing the same number of bits as the setcc and
49552// where the setcc will freely zero the upper bits of the k-register. We can
49553// replace the undef in the concat with 0s and remove the AND. This mainly
49554// helps with v2i1/v4i1 setcc being cast to scalar.
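// E.g. (illustrative): (and (i8 (bitcast (v8i1 (concat_vectors (v2i1 setcc),
// undef, undef, undef)))), 3) can drop the 'and 3' once the undef subvectors
// are rebuilt as zeroes.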
49555static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49556 const X86Subtarget &Subtarget) {
49557 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49558
49559 EVT VT = N->getValueType(0);
49560
49561 // Make sure this is an AND with constant. We will check the value of the
49562 // constant later.
49563 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49564 if (!C1)
49565 return SDValue();
49566
49567 // This is implied by the ConstantSDNode.
49568 assert(!VT.isVector() && "Expected scalar VT!");
49569
49570 SDValue Src = N->getOperand(0);
49571 if (!Src.hasOneUse())
49572 return SDValue();
49573
49574 // (Optionally) peek through any_extend().
49575 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49576 if (!Src.getOperand(0).hasOneUse())
49577 return SDValue();
49578 Src = Src.getOperand(0);
49579 }
49580
49581 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49582 return SDValue();
49583
49584 Src = Src.getOperand(0);
49585 EVT SrcVT = Src.getValueType();
49586
49587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49588 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49589 !TLI.isTypeLegal(SrcVT))
49590 return SDValue();
49591
49592 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49593 return SDValue();
49594
49595 // We only care about the first subvector of the concat; we expect the
49596 // other subvectors to be ignored due to the AND if we make the change.
49597 SDValue SubVec = Src.getOperand(0);
49598 EVT SubVecVT = SubVec.getValueType();
49599
49600 // The RHS of the AND should be a mask with as many bits as SubVec.
49601 if (!TLI.isTypeLegal(SubVecVT) ||
49602 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49603 return SDValue();
49604
49605 // The first subvector should be a setcc with a legal result type or an
49606 // AND containing at least one setcc with a legal result type.
49607 auto IsLegalSetCC = [&](SDValue V) {
49608 if (V.getOpcode() != ISD::SETCC)
49609 return false;
49610 EVT SetccVT = V.getOperand(0).getValueType();
49611 if (!TLI.isTypeLegal(SetccVT) ||
49612 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49613 return false;
49614 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49615 return false;
49616 return true;
49617 };
49618 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49619 (IsLegalSetCC(SubVec.getOperand(0)) ||
49620 IsLegalSetCC(SubVec.getOperand(1))))))
49621 return SDValue();
49622
49623 // We passed all the checks. Rebuild the concat_vectors with zeroes
49624 // and cast it back to VT.
49625 SDLoc dl(N);
49626 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49627 DAG.getConstant(0, dl, SubVecVT));
49628 Ops[0] = SubVec;
49629 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49630 Ops);
49631 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49632 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49633}
49634
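// Walk an AND/XOR expression tree (to a small depth) looking for a BMI-foldable
// form such as (sub 0, x) for BLSI or (sub x, 1)/(add x, -1) for BLSR/BLSMSK
// that pairs with OpMustEq, and reassociate so the pair ends up adjacent.
// E.g. (illustrative): x & (y & (x - 1)) is rebuilt as (x & (x - 1)) & y so the
// inner BLSR pattern stays intact.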
49635static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49636 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49637 // We don't want to go crazy with the recursion here. This isn't a super
49638 // important optimization.
49639 static constexpr unsigned kMaxDepth = 2;
49640
49641 // Only do this re-ordering if op has one use.
49642 if (!Op.hasOneUse())
49643 return SDValue();
49644
49645 SDLoc DL(Op);
49646 // If we hit another associative op, recurse further.
49647 if (Op.getOpcode() == Opc) {
49648 // Done recursing.
49649 if (Depth++ >= kMaxDepth)
49650 return SDValue();
49651
49652 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49653 if (SDValue R =
49654 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49655 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49656 Op.getOperand(1 - OpIdx));
49657
49658 } else if (Op.getOpcode() == ISD::SUB) {
49659 if (Opc == ISD::AND) {
49660 // BLSI: (and x, (sub 0, x))
49661 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49662 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49663 }
49664 // Opc must be ISD::AND or ISD::XOR
49665 // BLSR: (and x, (sub x, 1))
49666 // BLSMSK: (xor x, (sub x, 1))
49667 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49668 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49669
49670 } else if (Op.getOpcode() == ISD::ADD) {
49671 // Opc must be ISD::AND or ISD::XOR
49672 // BLSR: (and x, (add x, -1))
49673 // BLSMSK: (xor x, (add x, -1))
49674 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49675 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49676 }
49677 return SDValue();
49678}
49679
49680static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49681 const X86Subtarget &Subtarget) {
49682 EVT VT = N->getValueType(0);
49683 // Make sure this node is a candidate for BMI instructions.
49684 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49685 (VT != MVT::i32 && VT != MVT::i64))
49686 return SDValue();
49687
49688 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49689
49690 // Try and match LHS and RHS.
49691 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49692 if (SDValue OpMatch =
49693 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49694 N->getOperand(1 - OpIdx), 0))
49695 return OpMatch;
49696 return SDValue();
49697}
49698
49699static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
49700 SelectionDAG &DAG,
49701 TargetLowering::DAGCombinerInfo &DCI,
49702 const X86Subtarget &ST) {
49703 // cmp(setcc(cc, X), 0)
49704 // brcond ne
49705 // ->
49706 // X
49707 // brcond cc
49708
49709 // sub(setcc(cc, X), 1)
49710 // brcond ne
49711 // ->
49712 // X
49713 // brcond ~cc
49714 //
49715 // if only flag has users
49716
49717 SDValue SetCC = N->getOperand(0);
49718
49719 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
49720 return SDValue();
49721
49722 // Check the only user of flag is `brcond ne`.
49723 SDNode *BrCond = *Flag->uses().begin();
49724 if (BrCond->getOpcode() != X86ISD::BRCOND)
49725 return SDValue();
49726 unsigned CondNo = 2;
49727 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
49728 X86::COND_NE)
49729 return SDValue();
49730
49731 SDValue X = SetCC.getOperand(1);
49732 // sub has two results while X only has one. DAG combine assumes the value
49733 // types match.
49734 if (N->getOpcode() == X86ISD::SUB)
49735 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
49736
49737 SDValue CCN = SetCC.getOperand(0);
49738 X86::CondCode CC =
49739 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
49740 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
49741 // Update CC for the consumer of the flag.
49742 // The old CC is `ne`. Hence, when comparing the result with 0, we are
49743 // checking if the second condition evaluates to true. When comparing the
49744 // result with 1, we are checking if the second condition evaluates to false.
49745 SmallVector<SDValue> Ops(BrCond->op_values());
49746 if (isNullConstant(N->getOperand(1)))
49747 Ops[CondNo] = CCN;
49748 else if (isOneConstant(N->getOperand(1)))
49749 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
49750 else
49751 llvm_unreachable("expect constant 0 or 1");
49752
49753 SDValue NewBrCond =
49754 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
49755 // Avoid self-assign error b/c CC1 can be `e/ne`.
49756 if (BrCond != NewBrCond.getNode())
49757 DCI.CombineTo(BrCond, NewBrCond);
49758 return X;
49759}
49760
49761static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
49762 TargetLowering::DAGCombinerInfo &DCI,
49763 const X86Subtarget &ST) {
49764 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
49765 // ->
49766 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
49767
49768 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
49769 // ->
49770 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
49771 //
49772 // where cflags is determined by cc1.
49773
49774 if (!ST.hasCCMP())
49775 return SDValue();
49776
49777 SDValue SetCC0 = N->getOperand(0);
49778 SDValue SetCC1 = N->getOperand(1);
49779 if (SetCC0.getOpcode() != X86ISD::SETCC ||
49780 SetCC1.getOpcode() != X86ISD::SETCC)
49781 return SDValue();
49782
49783 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
49784 SDValue Op = V.getOperand(1);
49785 unsigned Opc = Op.getOpcode();
49786 if (Opc == X86ISD::SUB)
49787 return X86ISD::CCMP;
49788 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
49789 return X86ISD::CTEST;
49790 return 0U;
49791 };
49792
49793 unsigned NewOpc = 0;
49794
49795 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
49796 // appear on the right.
49797 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
49798 std::swap(SetCC0, SetCC1);
49799 if (!(NewOpc = GetCombineToOpc(SetCC1)))
49800 return SDValue();
49801 }
49802
49803 X86::CondCode CC0 =
49804 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
49805 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
49806 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
49807 return SDValue();
49808
49809 bool IsOR = N->getOpcode() == ISD::OR;
49810
49811 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
49812 // evaluates to true. So we need to use the inverse of CC0 as SrcCC when the
49813 // logic operator is OR. Similarly for CC1.
49814 SDValue SrcCC =
49815 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
49816 SDLoc(SetCC0.getOperand(0)), MVT::i8)
49817 : SetCC0.getOperand(0);
49818 SDValue CC1N = SetCC1.getOperand(0);
49819 X86::CondCode CC1 =
49820 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
49821 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
49822 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
49823 SDLoc DL(N);
49824 SDValue CFlags = DAG.getTargetConstant(
49825 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
49826 SDValue Sub = SetCC1.getOperand(1);
49827
49828 // Replace any uses of the old flag produced by SUB/CMP with the new one
49829 // produced by CCMP/CTEST.
49830 SDValue CCMP = (NewOpc == X86ISD::CCMP)
49831 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
49832 {Sub.getOperand(0), Sub.getOperand(1),
49833 CFlags, SrcCC, SetCC0.getOperand(1)})
49834 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
49835 {Sub.getOperand(0), Sub.getOperand(0),
49836 CFlags, SrcCC, SetCC0.getOperand(1)});
49837
49838 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
49839}
49840
49841static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49842 TargetLowering::DAGCombinerInfo &DCI,
49843 const X86Subtarget &Subtarget) {
49844 SDValue N0 = N->getOperand(0);
49845 SDValue N1 = N->getOperand(1);
49846 EVT VT = N->getValueType(0);
49847 SDLoc dl(N);
49848 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49849
49850 // If this is SSE1 only convert to FAND to avoid scalarization.
49851 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49852 return DAG.getBitcast(MVT::v4i32,
49853 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49854 DAG.getBitcast(MVT::v4f32, N0),
49855 DAG.getBitcast(MVT::v4f32, N1)));
49856 }
49857
49858 // Use a 32-bit and+zext if upper bits known zero.
49859 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49860 APInt HiMask = APInt::getHighBitsSet(64, 32);
49861 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49862 DAG.MaskedValueIsZero(N0, HiMask)) {
49863 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49864 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49865 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49866 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49867 }
49868 }
49869
49870 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49871 // TODO: Support multiple SrcOps.
49872 if (VT == MVT::i1) {
49873 SmallVector<SDValue, 2> SrcOps;
49874 SmallVector<APInt, 2> SrcPartials;
49875 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49876 SrcOps.size() == 1) {
49877 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49878 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49879 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49880 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49881 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49882 if (Mask) {
49883 assert(SrcPartials[0].getBitWidth() == NumElts &&
49884 "Unexpected partial reduction mask");
49885 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49886 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49887 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49888 }
49889 }
49890 }
49891
49892 // InstCombine converts:
49893 // `(-x << C0) & C1`
49894 // to
49895 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49896 // This saves an IR instruction but on x86 the neg/shift version is preferable
49897 // so undo the transform.
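  // E.g. (illustrative, scalar, C0 = 2, C1 = 0x1F, so Pow2_Ceil(C1) = 0x20):
  //   (x * 28) & 0x1F == ((0 - x) << 2) & 0x1F
  // and the neg+shift form is cheaper on x86 than the multiply.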
49898
49899 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49900 // TODO: We don't actually need a splat for this, we just need the checks to
49901 // hold for each element.
49902 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49903 /*AllowTruncation*/ false);
49904 ConstantSDNode *N01C =
49905 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49906 /*AllowTruncation*/ false);
49907 if (N1C && N01C) {
49908 const APInt &MulC = N01C->getAPIntValue();
49909 const APInt &AndC = N1C->getAPIntValue();
49910 APInt MulCLowBit = MulC & (-MulC);
49911 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49912 (MulCLowBit + MulC).isPowerOf2()) {
49913 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
49914 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49915 assert(MulCLowBitLog != -1 &&
49916 "Isolated lowbit is somehow not a power of 2!");
49917 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49918 DAG.getConstant(MulCLowBitLog, dl, VT));
49919 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49920 }
49921 }
49922 }
49923
49924 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
49925 return SetCC;
49926
49927 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49928 return V;
49929
49930 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49931 return R;
49932
49933 if (SDValue R = combineBitOpWithShift(N, DAG))
49934 return R;
49935
49936 if (SDValue R = combineBitOpWithPACK(N, DAG))
49937 return R;
49938
49939 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49940 return FPLogic;
49941
49942 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49943 return R;
49944
49945 if (DCI.isBeforeLegalizeOps())
49946 return SDValue();
49947
49948 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49949 return R;
49950
49951 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49952 return R;
49953
49954 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49955 return ShiftRight;
49956
49957 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49958 return R;
49959
49960 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49961 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49962 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49963 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49964 unsigned Opc0 = N0.getOpcode();
49965 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49966 getTargetConstantFromNode(N0.getOperand(1)) &&
49967 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49968 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49969 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49970 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49971 }
49972 }
49973
49974 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
49975 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
49976 if (isOneConstant(N1) && N0->hasOneUse()) {
49977 SDValue Src = N0;
49978 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49979 Src.getOpcode() == ISD::TRUNCATE) &&
49980 Src.getOperand(0)->hasOneUse())
49981 Src = Src.getOperand(0);
49982 bool ContainsNOT = false;
49983 X86::CondCode X86CC = X86::COND_B;
49984 // Peek through AND(NOT(SRL(X,Y)),1).
49985 if (isBitwiseNot(Src)) {
49986 Src = Src.getOperand(0);
49987 X86CC = X86::COND_AE;
49988 ContainsNOT = true;
49989 }
49990 if (Src.getOpcode() == ISD::SRL &&
49991 !isa<ConstantSDNode>(Src.getOperand(1))) {
49992 SDValue BitNo = Src.getOperand(1);
49993 Src = Src.getOperand(0);
49994 // Peek through AND(SRL(NOT(X),Y),1).
49995 if (isBitwiseNot(Src)) {
49996 Src = Src.getOperand(0);
49997 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49998 ContainsNOT = true;
49999 }
50000 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50001 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50002 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50003 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50004 }
50005 }
50006
50007 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50008 // Attempt to recursively combine a bitmask AND with shuffles.
50009 SDValue Op(N, 0);
50010 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50011 return Res;
50012
50013 // If either operand is a constant mask, then only the elements that aren't
50014 // zero are actually demanded by the other operand.
50015 auto GetDemandedMasks = [&](SDValue Op) {
50016 APInt UndefElts;
50017 SmallVector<APInt> EltBits;
50018 int NumElts = VT.getVectorNumElements();
50019 int EltSizeInBits = VT.getScalarSizeInBits();
50020 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50021 APInt DemandedElts = APInt::getAllOnes(NumElts);
50022 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50023 EltBits)) {
50024 DemandedBits.clearAllBits();
50025 DemandedElts.clearAllBits();
50026 for (int I = 0; I != NumElts; ++I) {
50027 if (UndefElts[I]) {
50028 // We can't assume an undef src element gives an undef dst - the
50029 // other src might be zero.
50030 DemandedBits.setAllBits();
50031 DemandedElts.setBit(I);
50032 } else if (!EltBits[I].isZero()) {
50033 DemandedBits |= EltBits[I];
50034 DemandedElts.setBit(I);
50035 }
50036 }
50037 }
50038 return std::make_pair(DemandedBits, DemandedElts);
50039 };
50040 APInt Bits0, Elts0;
50041 APInt Bits1, Elts1;
50042 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50043 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50044
50045 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50046 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50047 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50048 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50049 if (N->getOpcode() != ISD::DELETED_NODE)
50050 DCI.AddToWorklist(N);
50051 return SDValue(N, 0);
50052 }
50053
50054 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50055 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50056 if (NewN0 || NewN1)
50057 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50058 NewN1 ? NewN1 : N1);
50059 }
50060
50061 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50062 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50063 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50064 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
50065 SDValue BitMask = N1;
50066 SDValue SrcVec = N0.getOperand(0);
50067 EVT SrcVecVT = SrcVec.getValueType();
50068
50069 // Check that the constant bitmask masks whole bytes.
50070 APInt UndefElts;
50071 SmallVector<APInt, 64> EltBits;
50072 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50073 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50074 llvm::all_of(EltBits, [](const APInt &M) {
50075 return M.isZero() || M.isAllOnes();
50076 })) {
50077 unsigned NumElts = SrcVecVT.getVectorNumElements();
50078 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50079 unsigned Idx = N0.getConstantOperandVal(1);
50080
50081 // Create a root shuffle mask from the byte mask and the extracted index.
50082 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50083 for (unsigned i = 0; i != Scale; ++i) {
50084 if (UndefElts[i])
50085 continue;
50086 int VecIdx = Scale * Idx + i;
50087 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50088 }
50089
50090 if (SDValue Shuffle = combineX86ShufflesRecursively(
50091 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50092 X86::MaxShuffleCombineDepth,
50093 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50094 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50095 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50096 N0.getOperand(1));
50097 }
50098 }
50099
50100 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50101 return R;
50102
50103 return SDValue();
50104}
50105
50106// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50107static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50108 const X86Subtarget &Subtarget) {
50109 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50110
50111 MVT VT = N->getSimpleValueType(0);
50112 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50113 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50114 return SDValue();
50115
50116 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50117 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50118 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50119 return SDValue();
50120
50121 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50122 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50123 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50124 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50125 return SDValue();
50126
50127 // Attempt to extract constant byte masks.
50128 APInt UndefElts0, UndefElts1;
50129 SmallVector<APInt, 32> EltBits0, EltBits1;
50130 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50131 /*AllowWholeUndefs*/ false,
50132 /*AllowPartialUndefs*/ false))
50133 return SDValue();
50134 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50135 /*AllowWholeUndefs*/ false,
50136 /*AllowPartialUndefs*/ false))
50137 return SDValue();
50138
50139 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50140 // TODO - add UNDEF elts support.
50141 if (UndefElts0[i] || UndefElts1[i])
50142 return SDValue();
50143 if (EltBits0[i] != ~EltBits1[i])
50144 return SDValue();
50145 }
50146
50147 SDLoc DL(N);
50148
50149 if (useVPTERNLOG(Subtarget, VT)) {
50150 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50151 // VPTERNLOG is only available for vXi32/vXi64 types.
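  // (0xCA is the standard ternary-logic encoding: with truth-table inputs
  // A = 0xF0, B = 0xCC, C = 0xAA, (A & B) | (~A & C) evaluates to 0xCA.)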
50152 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
50153 MVT OpVT =
50154 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50155 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50156 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50157 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50158 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50159 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50160 DAG, Subtarget);
50161 return DAG.getBitcast(VT, Res);
50162 }
50163
50164 SDValue X = N->getOperand(0);
50165 SDValue Y =
50166 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50167 DAG.getBitcast(VT, N1.getOperand(0)));
50168 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50169}
50170
50171// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50172static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50173 if (N->getOpcode() != ISD::OR)
50174 return false;
50175
50176 SDValue N0 = N->getOperand(0);
50177 SDValue N1 = N->getOperand(1);
50178
50179 // Canonicalize AND to LHS.
50180 if (N1.getOpcode() == ISD::AND)
50181 std::swap(N0, N1);
50182
50183 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50184 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50185 return false;
50186
50187 Mask = N1.getOperand(0);
50188 X = N1.getOperand(1);
50189
50190 // Check to see if the mask appeared in both the AND and ANDNP.
50191 if (N0.getOperand(0) == Mask)
50192 Y = N0.getOperand(1);
50193 else if (N0.getOperand(1) == Mask)
50194 Y = N0.getOperand(0);
50195 else
50196 return false;
50197
50198 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50199 // ANDNP combine allows other combines to happen that prevent matching.
50200 return true;
50201}
50202
50203// Try to fold:
50204// (or (and (m, y), (pandn m, x)))
50205// into:
50206// (vselect m, x, y)
50207// As a special case, try to fold:
50208// (or (and (m, (sub 0, x)), (pandn m, x)))
50209// into:
50210// (sub (xor X, M), M)
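// (When an element of M is all-ones, (X ^ M) - M == ~X + 1 == -X; when it is
//  zero the result is just X, matching the blend of (sub 0, x) and x.)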
50211static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50212 const X86Subtarget &Subtarget) {
50213 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50214
50215 EVT VT = N->getValueType(0);
50216 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50217 (VT.is256BitVector() && Subtarget.hasInt256())))
50218 return SDValue();
50219
50220 SDValue X, Y, Mask;
50221 if (!matchLogicBlend(N, X, Y, Mask))
50222 return SDValue();
50223
50224 // Validate that X, Y, and Mask are bitcasts, and see through them.
50225 Mask = peekThroughBitcasts(Mask);
50226 X = peekThroughBitcasts(X);
50227 Y = peekThroughBitcasts(Y);
50228
50229 EVT MaskVT = Mask.getValueType();
50230 unsigned EltBits = MaskVT.getScalarSizeInBits();
50231
50232 // TODO: Attempt to handle floating point cases as well?
50233 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50234 return SDValue();
50235
50236 SDLoc DL(N);
50237
50238 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50239 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50240 DAG, Subtarget))
50241 return Res;
50242
50243 // PBLENDVB is only available on SSE 4.1.
50244 if (!Subtarget.hasSSE41())
50245 return SDValue();
50246
50247 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50248 if (Subtarget.hasVLX())
50249 return SDValue();
50250
50251 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50252
50253 X = DAG.getBitcast(BlendVT, X);
50254 Y = DAG.getBitcast(BlendVT, Y);
50255 Mask = DAG.getBitcast(BlendVT, Mask);
50256 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50257 return DAG.getBitcast(VT, Mask);
50258}
50259
50260// Helper function for combineOrCmpEqZeroToCtlzSrl
50261// Transforms:
50262// seteq(cmp x, 0)
50263// into:
50264// srl(ctlz x), log2(bitsize(x))
50265// Input pattern is checked by caller.
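// (ctlz(x) equals the bit width only when x == 0, so shifting the count right
//  by log2(bitwidth) leaves 1 exactly when x == 0; e.g. for i32 only 32 >> 5 is
//  non-zero.)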
50266static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50267 SDValue Cmp = Op.getOperand(1);
50268 EVT VT = Cmp.getOperand(0).getValueType();
50269 unsigned Log2b = Log2_32(VT.getSizeInBits());
50270 SDLoc dl(Op);
50271 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50272 // The result of the shift is true or false, and on X86, the 32-bit
50273 // encoding of shr and lzcnt is more desirable.
50274 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50275 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50276 DAG.getConstant(Log2b, dl, MVT::i8));
50277 return Scc;
50278}
50279
50280// Try to transform:
50281// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50282// into:
50283// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50284// Will also attempt to match more generic cases, eg:
50285// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50286// Only applies if the target supports the FastLZCNT feature.
50287static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50288 TargetLowering::DAGCombinerInfo &DCI,
50289 const X86Subtarget &Subtarget) {
50290 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50291 return SDValue();
50292
50293 auto isORCandidate = [](SDValue N) {
50294 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50295 };
50296
50297 // Check the zero extend is extending to 32-bit or more. The code generated by
50298 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50299 // instructions to clear the upper bits.
50300 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50301 !isORCandidate(N->getOperand(0)))
50302 return SDValue();
50303
50304 // Check the node matches: setcc(eq, cmp 0)
50305 auto isSetCCCandidate = [](SDValue N) {
50306 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50307 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50308 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50309 isNullConstant(N->getOperand(1).getOperand(1)) &&
50310 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50311 };
50312
50313 SDNode *OR = N->getOperand(0).getNode();
50314 SDValue LHS = OR->getOperand(0);
50315 SDValue RHS = OR->getOperand(1);
50316
50317 // Save nodes matching or(or, setcc(eq, cmp 0)).
50319 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50320 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50321 ORNodes.push_back(OR);
50322 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50323 LHS = OR->getOperand(0);
50324 RHS = OR->getOperand(1);
50325 }
50326
50327 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50328 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50329 !isORCandidate(SDValue(OR, 0)))
50330 return SDValue();
50331
50332 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50333 // to
50334 // or(srl(ctlz),srl(ctlz)).
50335 // The dag combiner can then fold it into:
50336 // srl(or(ctlz, ctlz)).
50337 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50338 SDValue Ret, NewRHS;
50339 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50340 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50341
50342 if (!Ret)
50343 return SDValue();
50344
50345 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50346 while (!ORNodes.empty()) {
50347 OR = ORNodes.pop_back_val();
50348 LHS = OR->getOperand(0);
50349 RHS = OR->getOperand(1);
50350 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50351 if (RHS->getOpcode() == ISD::OR)
50352 std::swap(LHS, RHS);
50353 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50354 if (!NewRHS)
50355 return SDValue();
50356 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50357 }
50358
50359 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50360}
50361
50362static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50363 SDValue And1_L, SDValue And1_R,
50364 const SDLoc &DL, SelectionDAG &DAG) {
50365 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50366 return SDValue();
50367 SDValue NotOp = And0_L->getOperand(0);
50368 if (NotOp == And1_R)
50369 std::swap(And1_R, And1_L);
50370 if (NotOp != And1_L)
50371 return SDValue();
50372
50373 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50374 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50375 EVT VT = And1_L->getValueType(0);
50376 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50377 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50378 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50379 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50380 return Xor1;
50381}
50382
50383/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50384/// equivalent `((x ^ y) & m) ^ y)` pattern.
50385/// This is typically a better representation for targets without a fused
50386/// "and-not" operation. This function is intended to be called from a
50387/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
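/// (Check: with m all-ones the folded form yields (x ^ y) ^ y == x, and with m
/// zero it yields 0 ^ y == y, matching the original masked merge.)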
50388static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50389 // Note that masked-merge variants using XOR or ADD expressions are
50390 // normalized to OR by InstCombine so we only check for OR.
50391 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50392 SDValue N0 = Node->getOperand(0);
50393 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50394 return SDValue();
50395 SDValue N1 = Node->getOperand(1);
50396 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50397 return SDValue();
50398
50399 SDLoc DL(Node);
50400 SDValue N00 = N0->getOperand(0);
50401 SDValue N01 = N0->getOperand(1);
50402 SDValue N10 = N1->getOperand(0);
50403 SDValue N11 = N1->getOperand(1);
50404 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50405 return Result;
50406 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50407 return Result;
50408 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50409 return Result;
50410 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50411 return Result;
50412 return SDValue();
50413}
50414
50415/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50416/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50417/// with CMP+{ADC, SBB}.
50418/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
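/// For example (illustrative): 'x + (unsigned)(a < b)' can become
///   cmp a, b   ; CF is set when a < b (unsigned)
///   adc x, 0
/// instead of materializing the setcc result in a register first.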
50419static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50420 SDValue X, SDValue Y,
50421 SelectionDAG &DAG,
50422 bool ZeroSecondOpOnly = false) {
50423 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50424 return SDValue();
50425
50426 // Look through a one-use zext.
50427 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50428 Y = Y.getOperand(0);
50429
50430 X86::CondCode CC = X86::COND_INVALID;
50431 SDValue EFLAGS;
50432 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50433 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50434 EFLAGS = Y.getOperand(1);
50435 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50436 Y.hasOneUse()) {
50437 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50438 }
50439
50440 if (!EFLAGS)
50441 return SDValue();
50442
50443 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50444 // the general case below.
50445 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50446 if (ConstantX && !ZeroSecondOpOnly) {
50447 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50448 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50449 // This is a complicated way to get -1 or 0 from the carry flag:
50450 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50451 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50452 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50453 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50454 EFLAGS);
50455 }
50456
50457 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50458 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50459 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50460 EFLAGS.getValueType().isInteger() &&
50461 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50462 // Swap the operands of a SUB, and we have the same pattern as above.
50463 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50464 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50465 SDValue NewSub = DAG.getNode(
50466 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50467 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50468 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50469 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50470 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50471 NewEFLAGS);
50472 }
50473 }
50474 }
50475
50476 if (CC == X86::COND_B) {
50477 // X + SETB Z --> adc X, 0
50478 // X - SETB Z --> sbb X, 0
50479 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50480 DAG.getVTList(VT, MVT::i32), X,
50481 DAG.getConstant(0, DL, VT), EFLAGS);
50482 }
50483
50484 if (ZeroSecondOpOnly)
50485 return SDValue();
50486
50487 if (CC == X86::COND_A) {
50488 // Try to convert COND_A into COND_B in an attempt to facilitate
50489 // materializing "setb reg".
50490 //
50491 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50492 // cannot take an immediate as its first operand.
50493 //
50494 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50495 EFLAGS.getValueType().isInteger() &&
50496 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50497 SDValue NewSub =
50498 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50499 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50500 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50501 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50502 DAG.getVTList(VT, MVT::i32), X,
50503 DAG.getConstant(0, DL, VT), NewEFLAGS);
50504 }
50505 }
50506
50507 if (CC == X86::COND_AE) {
50508 // X + SETAE --> sbb X, -1
50509 // X - SETAE --> adc X, -1
50510 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50511 DAG.getVTList(VT, MVT::i32), X,
50512 DAG.getConstant(-1, DL, VT), EFLAGS);
50513 }
50514
50515 if (CC == X86::COND_BE) {
50516 // X + SETBE --> sbb X, -1
50517 // X - SETBE --> adc X, -1
50518 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50519 // materializing "setae reg".
50520 //
50521 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50522 // cannot take an immediate as its first operand.
50523 //
50524 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50525 EFLAGS.getValueType().isInteger() &&
50526 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50527 SDValue NewSub =
50528 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50529 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50530 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50531 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50532 DAG.getVTList(VT, MVT::i32), X,
50533 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50534 }
50535 }
50536
50537 if (CC != X86::COND_E && CC != X86::COND_NE)
50538 return SDValue();
50539
50540 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50541 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50542 !EFLAGS.getOperand(0).getValueType().isInteger())
50543 return SDValue();
50544
50545 SDValue Z = EFLAGS.getOperand(0);
50546 EVT ZVT = Z.getValueType();
50547
50548 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50549 // the general case below.
50550 if (ConstantX) {
50551 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50552 // fake operands:
50553 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50554 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50555 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50556 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50557 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50558 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50559 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50560 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50561 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50562 SDValue(Neg.getNode(), 1));
50563 }
50564
50565 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50566 // with fake operands:
50567 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50568 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50569 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50570 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50571 SDValue One = DAG.getConstant(1, DL, ZVT);
50572 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50573 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50574 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50575 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50576 Cmp1.getValue(1));
50577 }
50578 }
50579
50580 // (cmp Z, 1) sets the carry flag if Z is 0.
50581 SDValue One = DAG.getConstant(1, DL, ZVT);
50582 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50583 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50584
50585 // Add the flags type for ADC/SBB nodes.
50586 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50587
50588 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50589 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50590 if (CC == X86::COND_NE)
50591 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50592 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50593
50594 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50595 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50596 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50597 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50598}
50599
50600/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50601/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50602/// with CMP+{ADC, SBB}.
50603static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
50604 SelectionDAG &DAG) {
50605 bool IsSub = N->getOpcode() == ISD::SUB;
50606 SDValue X = N->getOperand(0);
50607 SDValue Y = N->getOperand(1);
50608 EVT VT = N->getValueType(0);
50609
50610 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50611 return ADCOrSBB;
50612
50613 // Commute and try again (negate the result for subtracts).
50614 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50615 if (IsSub)
50616 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
50617 return ADCOrSBB;
50618 }
50619
50620 return SDValue();
50621}
50622
50623static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50624 SelectionDAG &DAG) {
50625 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50626 "Unexpected opcode");
50627
50628 // Delegate to combineAddOrSubToADCOrSBB if we have:
50629 //
50630 // (xor/or (zero_extend (setcc)) imm)
50631 //
50632 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50633 // equivalent to a SUB/ADD, respectively.
50634 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50635 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50636 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50637 bool IsSub = N->getOpcode() == ISD::XOR;
50638 bool N1COdd = N1C->getZExtValue() & 1;
50639 if (IsSub ? N1COdd : !N1COdd) {
50640 SDLoc DL(N);
50641 EVT VT = N->getValueType(0);
50642 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50643 return R;
50644 }
50645 }
50646 }
50647
50648 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
50649 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
50650 N0.getOperand(0).getOpcode() == ISD::AND &&
50653 MVT VT = N->getSimpleValueType(0);
50654 APInt UndefElts;
50655 SmallVector<APInt> EltBits;
50656 if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1),
50657 VT.getScalarSizeInBits(), UndefElts,
50658 EltBits)) {
50659 bool IsPow2OrUndef = true;
50660 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
50661 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
50662
50663 if (IsPow2OrUndef)
50664 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
50665 N0.getOperand(0).getOperand(1));
50666 }
50667 }
50668
50669 return SDValue();
50670}
50671
50672static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50673 TargetLowering::DAGCombinerInfo &DCI,
50674 const X86Subtarget &Subtarget) {
50675 SDValue N0 = N->getOperand(0);
50676 SDValue N1 = N->getOperand(1);
50677 EVT VT = N->getValueType(0);
50678 SDLoc dl(N);
50679 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50680
50681 // If this is SSE1 only convert to FOR to avoid scalarization.
50682 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50683 return DAG.getBitcast(MVT::v4i32,
50684 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50685 DAG.getBitcast(MVT::v4f32, N0),
50686 DAG.getBitcast(MVT::v4f32, N1)));
50687 }
50688
50689 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50690 // TODO: Support multiple SrcOps.
50691 if (VT == MVT::i1) {
50692 SmallVector<SDValue, 2> SrcOps;
50693 SmallVector<APInt, 2> SrcPartials;
50694 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50695 SrcOps.size() == 1) {
50696 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50697 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50698 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50699 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50700 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50701 if (Mask) {
50702 assert(SrcPartials[0].getBitWidth() == NumElts &&
50703 "Unexpected partial reduction mask");
50704 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50705 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50706 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50707 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50708 }
50709 }
50710 }
50711
50712 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
50713 return SetCC;
50714
50715 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50716 return R;
50717
50718 if (SDValue R = combineBitOpWithShift(N, DAG))
50719 return R;
50720
50721 if (SDValue R = combineBitOpWithPACK(N, DAG))
50722 return R;
50723
50724 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50725 return FPLogic;
50726
50727 if (DCI.isBeforeLegalizeOps())
50728 return SDValue();
50729
50730 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50731 return R;
50732
50733 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50734 return R;
50735
50736 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50737 return R;
50738
50739 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50740 if ((VT == MVT::i32 || VT == MVT::i64) &&
50741 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50742 isNullConstant(N0.getOperand(0))) {
50743 SDValue Cond = N0.getOperand(1);
50744 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50745 Cond = Cond.getOperand(0);
50746
50747 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50748 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50749 uint64_t Val = CN->getZExtValue();
50750 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50751 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50752 CCode = X86::GetOppositeBranchCondition(CCode);
50753 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50754
50755 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50756 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50757 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50758 return R;
50759 }
50760 }
50761 }
50762 }
50763
50764 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50765 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50766 // iff the upper elements of the non-shifted arg are zero.
50767 // KUNPCK requires 16+ bool vector elements.
50768 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50769 unsigned NumElts = VT.getVectorNumElements();
50770 unsigned HalfElts = NumElts / 2;
50771 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50772 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50773 N1.getConstantOperandAPInt(1) == HalfElts &&
50774 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50775 return DAG.getNode(
50776 ISD::CONCAT_VECTORS, dl, VT,
50777 extractSubVector(N0, 0, DAG, dl, HalfElts),
50778 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50779 }
50780 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50781 N0.getConstantOperandAPInt(1) == HalfElts &&
50782 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50783 return DAG.getNode(
50784 ISD::CONCAT_VECTORS, dl, VT,
50785 extractSubVector(N1, 0, DAG, dl, HalfElts),
50786 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50787 }
50788 }
50789
50790 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50791 // Attempt to recursively combine an OR of shuffles.
50792 SDValue Op(N, 0);
50793 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50794 return Res;
50795
50796 // If either operand is a constant mask, then only the elements that aren't
50797 // allones are actually demanded by the other operand.
50798 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50799 APInt UndefElts;
50800 SmallVector<APInt> EltBits;
50801 int NumElts = VT.getVectorNumElements();
50802 int EltSizeInBits = VT.getScalarSizeInBits();
50803 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50804 return false;
50805
50806 APInt DemandedElts = APInt::getZero(NumElts);
50807 for (int I = 0; I != NumElts; ++I)
50808 if (!EltBits[I].isAllOnes())
50809 DemandedElts.setBit(I);
50810
50811 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50812 };
50813 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50814 if (N->getOpcode() != ISD::DELETED_NODE)
50815 DCI.AddToWorklist(N);
50816 return SDValue(N, 0);
50817 }
50818 }
50819
50820 // We should fold "masked merge" patterns when `andn` is not available.
50821 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50822 if (SDValue R = foldMaskedMerge(N, DAG))
50823 return R;
50824
50825 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50826 return R;
50827
50828 return SDValue();
50829}
50830
50831/// Try to turn tests against the signbit in the form of:
50832/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50833/// into:
50834/// SETGT(X, -1)
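/// (The srl moves the sign bit of X into bit 0 and the xor with 1 inverts it,
/// so the result is 1 exactly when X is non-negative, i.e. X > -1.)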
50835static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50836 // This is only worth doing if the output type is i8 or i1.
50837 EVT ResultType = N->getValueType(0);
50838 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50839 return SDValue();
50840
50841 SDValue N0 = N->getOperand(0);
50842 SDValue N1 = N->getOperand(1);
50843
50844 // We should be performing an xor against a truncated shift.
50845 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50846 return SDValue();
50847
50848 // Make sure we are performing an xor against one.
50849 if (!isOneConstant(N1))
50850 return SDValue();
50851
50852 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50853 SDValue Shift = N0.getOperand(0);
50854 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50855 return SDValue();
50856
50857 // Make sure we are truncating from one of i16, i32 or i64.
50858 EVT ShiftTy = Shift.getValueType();
50859 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50860 return SDValue();
50861
50862 // Make sure the shift amount extracts the sign bit.
50863 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50864 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50865 return SDValue();
50866
50867 // Create a greater-than comparison against -1.
50868 // N.B. Using SETGE against 0 works but we want a canonical looking
50869 // comparison; using SETGT matches up with what TranslateX86CC expects.
50870 SDLoc DL(N);
50871 SDValue ShiftOp = Shift.getOperand(0);
50872 EVT ShiftOpTy = ShiftOp.getValueType();
50873 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50874 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50875 *DAG.getContext(), ResultType);
50876 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50877 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50878 if (SetCCResultType != ResultType)
50879 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50880 return Cond;
50881}
50882
50883/// Turn vector tests of the signbit in the form of:
50884/// xor (sra X, elt_size(X)-1), -1
50885/// into:
50886/// pcmpgt X, -1
50887///
50888/// This should be called before type legalization because the pattern may not
50889/// persist after that.
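/// (The sra smears the sign bit, producing 0 or -1 per element; the 'not' then
/// yields -1 exactly for the non-negative elements, which is what pcmpgt X, -1
/// computes directly.)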
50890static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50891 const X86Subtarget &Subtarget) {
50892 EVT VT = N->getValueType(0);
50893 if (!VT.isSimple())
50894 return SDValue();
50895
50896 switch (VT.getSimpleVT().SimpleTy) {
50897 // clang-format off
50898 default: return SDValue();
50899 case MVT::v16i8:
50900 case MVT::v8i16:
50901 case MVT::v4i32:
50902 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50903 case MVT::v32i8:
50904 case MVT::v16i16:
50905 case MVT::v8i32:
50906 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50907 // clang-format on
50908 }
50909
50910 // There must be an arithmetic shift right before the xor, and the xor must be
50911 // a 'not' operation.
50912 SDValue Shift = N->getOperand(0);
50913 SDValue Ones = N->getOperand(1);
50914 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50915 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50916 return SDValue();
50917
50918 // The shift should be smearing the sign bit across each vector element.
50919 auto *ShiftAmt =
50920 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50921 if (!ShiftAmt ||
50922 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50923 return SDValue();
50924
50925 // Create a greater-than comparison against -1. We don't use the more obvious
50926 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50927 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50928}
50929
50930/// Detect patterns of truncation with unsigned saturation:
50931///
50932/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50933/// Return the source value x to be truncated or SDValue() if the pattern was
50934/// not matched.
50935///
50936/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50937/// where C1 >= 0 and C2 is unsigned max of destination type.
50938///
50939/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50940/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50941///
50942/// These two patterns are equivalent to:
50943/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50944/// So return the smax(x, C1) value to be truncated or SDValue() if the
50945/// pattern was not matched.
50946static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50947 const SDLoc &DL) {
50948 EVT InVT = In.getValueType();
50949
50950 // Saturation with truncation. We truncate from InVT to VT.
50952 "Unexpected types for truncate operation");
50953
50954 // Match min/max and return limit value as a parameter.
50955 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50956 if (V.getOpcode() == Opcode &&
50957 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50958 return V.getOperand(0);
50959 return SDValue();
50960 };
50961
50962 APInt C1, C2;
50963 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50964 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
50965 // to the element size of the destination type.
50966 if (C2.isMask(VT.getScalarSizeInBits()))
50967 return UMin;
50968
50969 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50970 if (MatchMinMax(SMin, ISD::SMAX, C1))
50971 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50972 return SMin;
50973
50974 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50975 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50976 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50977 C2.uge(C1)) {
50978 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50979 }
50980
50981 return SDValue();
50982}
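// Worked example (illustrative only), truncating i32 -> i8 with unsigned
// saturation:
//   trunc(umin(x, 255))              ; pattern 1
//   trunc(smin(smax(x, 0), 255))     ; pattern 2, the smax rules out negatives
// Both clamp the value into [0, 255] before truncating, so the truncation
// loses no information and the whole expression is an unsigned-saturating
// truncate of x (or of smax(x, C1) in the second form).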
50983
50984/// Detect patterns of truncation with signed saturation:
50985/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50986/// signed_max_of_dest_type)) to dest_type)
50987/// or:
50988/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50989/// signed_min_of_dest_type)) to dest_type).
50990/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50991/// Return the source value to be truncated or SDValue() if the pattern was not
50992/// matched.
50993static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50994 unsigned NumDstBits = VT.getScalarSizeInBits();
50995 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50996 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50997
50998 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50999 const APInt &Limit) -> SDValue {
51000 APInt C;
51001 if (V.getOpcode() == Opcode &&
51002 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51003 return V.getOperand(0);
51004 return SDValue();
51005 };
51006
51007 APInt SignedMax, SignedMin;
51008 if (MatchPackUS) {
51009 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51010 SignedMin = APInt(NumSrcBits, 0);
51011 } else {
51012 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51013 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51014 }
51015
51016 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51017 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51018 return SMax;
51019
51020 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51021 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51022 return SMin;
51023
51024 return SDValue();
51025}
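// Worked example (illustrative only), truncating i16 -> i8 with signed
// saturation:
//   trunc(smin(smax(x, -128), 127))
// clamps x into [-128, 127] first, so the truncate is exact. With MatchPackUS
// the clamp range is [0, 255] instead, matching what PACKUS produces.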
51026
51027static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51028 SelectionDAG &DAG,
51029 const X86Subtarget &Subtarget) {
51030 if (!Subtarget.hasSSE2() || !VT.isVector())
51031 return SDValue();
51032
51033 EVT SVT = VT.getVectorElementType();
51034 EVT InVT = In.getValueType();
51035 EVT InSVT = InVT.getVectorElementType();
51036
51037 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51038 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51039 // and concatenate at the same time. Then we can use a final vpmovuswb to
51040 // clip to 0-255.
51041 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51042 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51043 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51044 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51045 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51046 DL, DAG, Subtarget);
51047 assert(Mid && "Failed to pack!");
51048 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51049 }
51050 }
51051
51052 // vXi32 truncate instructions are available with AVX512F.
51053 // vXi16 truncate instructions are only available with AVX512BW.
51054 // For 256-bit or smaller vectors, we require VLX.
51055 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51056 // If the result type is 256 bits or larger and we have disabled 512-bit
51057 // registers, we should go ahead and use the pack instructions if possible.
51058 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51059 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51060 (InVT.getSizeInBits() > 128) &&
51061 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51062 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51063
51064 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
51065 isPowerOf2_32(VT.getVectorNumElements()) &&
51066 (SVT == MVT::i8 || SVT == MVT::i16) &&
51067 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51068 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51069 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51070 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51071 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51072 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51073 DAG, Subtarget);
51074 assert(Mid && "Failed to pack!");
51075 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51076 Subtarget);
51077 assert(V && "Failed to pack!");
51078 return V;
51079 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51080 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51081 Subtarget);
51082 }
51083 if (SDValue SSatVal = detectSSatPattern(In, VT))
51084 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51085 Subtarget);
51086 }
51087
51088 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51089 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51090 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51091 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51092 unsigned TruncOpc = 0;
51093 SDValue SatVal;
51094 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51095 SatVal = SSatVal;
51096 TruncOpc = X86ISD::VTRUNCS;
51097 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51098 SatVal = USatVal;
51099 TruncOpc = X86ISD::VTRUNCUS;
51100 }
51101 if (SatVal) {
51102 unsigned ResElts = VT.getVectorNumElements();
51103 // If the input type is less than 512 bits and we don't have VLX, we need
51104 // to widen to 512 bits.
51105 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51106 unsigned NumConcats = 512 / InVT.getSizeInBits();
51107 ResElts *= NumConcats;
51108 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51109 ConcatOps[0] = SatVal;
51110 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51111 NumConcats * InVT.getVectorNumElements());
51112 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51113 }
51114 // Widen the result if it's narrower than 128 bits.
51115 if (ResElts * SVT.getSizeInBits() < 128)
51116 ResElts = 128 / SVT.getSizeInBits();
51117 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51118 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51119 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51120 DAG.getIntPtrConstant(0, DL));
51121 }
51122 }
51123
51124 return SDValue();
51125}
51126
51127static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
51128 SelectionDAG &DAG,
51129 TargetLowering::DAGCombinerInfo &DCI,
51130 const X86Subtarget &Subtarget) {
51131 auto *Ld = cast<LoadSDNode>(N);
51132 EVT RegVT = Ld->getValueType(0);
51133 SDValue Ptr = Ld->getBasePtr();
51134 SDValue Chain = Ld->getChain();
51135 ISD::LoadExtType Ext = Ld->getExtensionType();
51136
51137 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
51138 return SDValue();
51139
51140 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
51141 return SDValue();
51142
51143 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
51144 if (!LdC)
51145 return SDValue();
51146
51147 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
51148 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
51149 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
51150 if (Undefs[I])
51151 continue;
51152 if (UserUndefs[I] || Bits[I] != UserBits[I])
51153 return false;
51154 }
51155 return true;
51156 };
51157
51158 // Look through all other loads/broadcasts in the chain for another constant
51159 // pool entry.
51160 for (SDNode *User : Chain->uses()) {
51161 auto *UserLd = dyn_cast<MemSDNode>(User);
51162 if (User != N && UserLd &&
51163 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
51164 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51165 ISD::isNormalLoad(User)) &&
51166 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
51167 User->getValueSizeInBits(0).getFixedValue() >
51168 RegVT.getFixedSizeInBits()) {
51169 EVT UserVT = User->getValueType(0);
51170 SDValue UserPtr = UserLd->getBasePtr();
51171 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
51172
51173 // See if we are loading a constant that matches in the lower
51174 // bits of a longer constant (but from a different constant pool ptr).
51175 if (UserC && UserPtr != Ptr) {
51176 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
51177 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
51178 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
51179 APInt Undefs, UserUndefs;
51180 SmallVector<APInt> Bits, UserBits;
51181 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
51182 UserVT.getScalarSizeInBits());
51183 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
51184 Bits) &&
51185 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
51186 UserUndefs, UserBits)) {
51187 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
51188 SDValue Extract = extractSubVector(
51189 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
51190 Extract = DAG.getBitcast(RegVT, Extract);
51191 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51192 }
51193 }
51194 }
51195 }
51196 }
51197 }
51198
51199 return SDValue();
51200}
51201
51202static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51203 TargetLowering::DAGCombinerInfo &DCI,
51204 const X86Subtarget &Subtarget) {
51205 auto *Ld = cast<LoadSDNode>(N);
51206 EVT RegVT = Ld->getValueType(0);
51207 EVT MemVT = Ld->getMemoryVT();
51208 SDLoc dl(Ld);
51209 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51210
51211 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51212 // into two 16-byte operations. Also split non-temporal aligned loads on
51213 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51214 ISD::LoadExtType Ext = Ld->getExtensionType();
51215 unsigned Fast;
51216 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51217 Ext == ISD::NON_EXTLOAD &&
51218 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51219 Ld->getAlign() >= Align(16)) ||
51220 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51221 *Ld->getMemOperand(), &Fast) &&
51222 !Fast))) {
51223 unsigned NumElems = RegVT.getVectorNumElements();
51224 if (NumElems < 2)
51225 return SDValue();
51226
51227 unsigned HalfOffset = 16;
51228 SDValue Ptr1 = Ld->getBasePtr();
51229 SDValue Ptr2 =
51230 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
51231 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51232 NumElems / 2);
51233 SDValue Load1 =
51234 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51235 Ld->getOriginalAlign(),
51236 Ld->getMemOperand()->getFlags());
51237 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51238 Ld->getPointerInfo().getWithOffset(HalfOffset),
51239 Ld->getOriginalAlign(),
51240 Ld->getMemOperand()->getFlags());
51241 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51242 Load1.getValue(1), Load2.getValue(1));
51243
51244 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51245 return DCI.CombineTo(N, NewVec, TF, true);
51246 }
51247
51248 // Bool vector load - attempt to cast to an integer, as we have good
51249 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51250 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51251 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51252 unsigned NumElts = RegVT.getVectorNumElements();
51253 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51254 if (TLI.isTypeLegal(IntVT)) {
51255 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51256 Ld->getPointerInfo(),
51257 Ld->getOriginalAlign(),
51258 Ld->getMemOperand()->getFlags());
51259 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51260 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51261 }
51262 }
51263
51264 // If we also broadcast this vector to a wider type, then just extract the
51265 // lowest subvector.
51266 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51267 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51268 SDValue Ptr = Ld->getBasePtr();
51269 SDValue Chain = Ld->getChain();
51270 for (SDNode *User : Chain->uses()) {
51271 auto *UserLd = dyn_cast<MemSDNode>(User);
51272 if (User != N && UserLd &&
51273 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51274 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
51275 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
51276 !User->hasAnyUseOfValue(1) &&
51277 User->getValueSizeInBits(0).getFixedValue() >
51278 RegVT.getFixedSizeInBits()) {
51279 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
51280 RegVT.getSizeInBits());
51281 Extract = DAG.getBitcast(RegVT, Extract);
51282 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51283 }
51284 }
51285 }
51286
51287 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
51288 return V;
51289
51290 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51291 unsigned AddrSpace = Ld->getAddressSpace();
51292 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51293 AddrSpace == X86AS::PTR32_UPTR) {
51294 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51295 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51296 SDValue Cast =
51297 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51298 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
51299 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
51300 Ld->getMemOperand()->getFlags());
51301 }
51302 }
51303
51304 return SDValue();
51305}
51306
51307/// If V is a build vector of boolean constants and exactly one of those
51308/// constants is true, return the operand index of that true element.
51309/// Otherwise, return -1.
51310static int getOneTrueElt(SDValue V) {
51311 // This needs to be a build vector of booleans.
51312 // TODO: Checking for the i1 type matches the IR definition for the mask,
51313 // but the mask check could be loosened to i8 or other types. That might
51314 // also require checking more than 'allOnesValue'; e.g., the x86 HW
51315 // instructions only require that the MSB is set for each mask element.
51316 // The ISD::MSTORE comments/definition do not specify how the mask operand
51317 // is formatted.
51318 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51319 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51320 return -1;
51321
51322 int TrueIndex = -1;
51323 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51324 for (unsigned i = 0; i < NumElts; ++i) {
51325 const SDValue &Op = BV->getOperand(i);
51326 if (Op.isUndef())
51327 continue;
51328 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51329 if (!ConstNode)
51330 return -1;
51331 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51332 // If we already found a one, this is too many.
51333 if (TrueIndex >= 0)
51334 return -1;
51335 TrueIndex = i;
51336 }
51337 }
51338 return TrueIndex;
51339}
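// Example (illustrative only): for the boolean build_vector <0, 0, 1, 0>
// getOneTrueElt returns 2; for <1, 0, 1, 0> it returns -1 because more than
// one element is true.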
51340
51341/// Given a masked memory load/store operation, return true if it has one mask
51342/// bit set. If it has one mask bit set, then also return the memory address of
51343/// the scalar element to load/store, the vector index to insert/extract that
51344/// scalar element, and the alignment for the scalar memory access.
51345static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51346 SelectionDAG &DAG, SDValue &Addr,
51347 SDValue &Index, Align &Alignment,
51348 unsigned &Offset) {
51349 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51350 if (TrueMaskElt < 0)
51351 return false;
51352
51353 // Get the address of the one scalar element that is specified by the mask
51354 // using the appropriate offset from the base pointer.
51355 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51356 Offset = 0;
51357 Addr = MaskedOp->getBasePtr();
51358 if (TrueMaskElt != 0) {
51359 Offset = TrueMaskElt * EltVT.getStoreSize();
51360 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
51361 SDLoc(MaskedOp));
51362 }
51363
51364 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51365 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51366 EltVT.getStoreSize());
51367 return true;
51368}
51369
51370/// If exactly one element of the mask is set for a non-extending masked load,
51371/// it is a scalar load and vector insert.
51372/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51373/// mask have already been optimized in IR, so we don't bother with those here.
51374static SDValue
51375reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51376 TargetLowering::DAGCombinerInfo &DCI,
51377 const X86Subtarget &Subtarget) {
51378 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51379 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51380 // However, some target hooks may need to be added to know when the transform
51381 // is profitable. Endianness would also have to be considered.
51382
51383 SDValue Addr, VecIndex;
51384 Align Alignment;
51385 unsigned Offset;
51386 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51387 return SDValue();
51388
51389 // Load the one scalar element that is specified by the mask using the
51390 // appropriate offset from the base pointer.
51391 SDLoc DL(ML);
51392 EVT VT = ML->getValueType(0);
51393 EVT EltVT = VT.getVectorElementType();
51394
51395 EVT CastVT = VT;
51396 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51397 EltVT = MVT::f64;
51398 CastVT = VT.changeVectorElementType(EltVT);
51399 }
51400
51401 SDValue Load =
51402 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51403 ML->getPointerInfo().getWithOffset(Offset),
51404 Alignment, ML->getMemOperand()->getFlags());
51405
51406 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51407
51408 // Insert the loaded element into the appropriate place in the vector.
51409 SDValue Insert =
51410 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51411 Insert = DAG.getBitcast(VT, Insert);
51412 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51413}
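// Example (illustrative only): a v4f32 masked load with mask <0, 0, 1, 0>
// becomes a scalar f32 load from BasePtr+8 (element 2, 4 bytes per element)
// followed by an insert_vector_elt of that scalar into the pass-through
// value at index 2.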
51414
51415static SDValue
51416combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51417 TargetLowering::DAGCombinerInfo &DCI) {
51418 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51419 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51420 return SDValue();
51421
51422 SDLoc DL(ML);
51423 EVT VT = ML->getValueType(0);
51424
51425 // If we are loading the first and last elements of a vector, it is safe and
51426 // always faster to load the whole vector. Replace the masked load with a
51427 // vector load and select.
51428 unsigned NumElts = VT.getVectorNumElements();
51429 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51430 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51431 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51432 if (LoadFirstElt && LoadLastElt) {
51433 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51434 ML->getMemOperand());
51435 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51436 ML->getPassThru());
51437 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51438 }
51439
51440 // Convert a masked load with a constant mask into a masked load and a select.
51441 // This allows the select operation to use a faster kind of select instruction
51442 // (for example, vblendvps -> vblendps).
51443
51444 // Don't try this if the pass-through operand is already undefined. That would
51445 // cause an infinite loop because that's what we're about to create.
51446 if (ML->getPassThru().isUndef())
51447 return SDValue();
51448
51449 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51450 return SDValue();
51451
51452 // The new masked load has an undef pass-through operand. The select uses the
51453 // original pass-through operand.
51454 SDValue NewML = DAG.getMaskedLoad(
51455 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51456 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51457 ML->getAddressingMode(), ML->getExtensionType());
51458 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51459 ML->getPassThru());
51460
51461 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51462}
51463
51464static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51465 TargetLowering::DAGCombinerInfo &DCI,
51466 const X86Subtarget &Subtarget) {
51467 auto *Mld = cast<MaskedLoadSDNode>(N);
51468
51469 // TODO: Expanding load with constant mask may be optimized as well.
51470 if (Mld->isExpandingLoad())
51471 return SDValue();
51472
51473 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51474 if (SDValue ScalarLoad =
51475 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51476 return ScalarLoad;
51477
51478 // TODO: Do some AVX512 subsets benefit from this transform?
51479 if (!Subtarget.hasAVX512())
51480 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51481 return Blend;
51482 }
51483
51484 // If the mask value has been legalized to a non-boolean vector, try to
51485 // simplify ops leading up to it. We only demand the MSB of each lane.
51486 SDValue Mask = Mld->getMask();
51487 if (Mask.getScalarValueSizeInBits() != 1) {
51488 EVT VT = Mld->getValueType(0);
51489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51490 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51491 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51492 if (N->getOpcode() != ISD::DELETED_NODE)
51493 DCI.AddToWorklist(N);
51494 return SDValue(N, 0);
51495 }
51496 if (SDValue NewMask =
51497 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51498 return DAG.getMaskedLoad(
51499 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51500 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51501 Mld->getAddressingMode(), Mld->getExtensionType());
51502 }
51503
51504 return SDValue();
51505}
51506
51507/// If exactly one element of the mask is set for a non-truncating masked store,
51508/// it is a vector extract and scalar store.
51509/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51510/// mask have already been optimized in IR, so we don't bother with those here.
51511static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51512 SelectionDAG &DAG,
51513 const X86Subtarget &Subtarget) {
51514 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51515 // However, some target hooks may need to be added to know when the transform
51516 // is profitable. Endianness would also have to be considered.
51517
51518 SDValue Addr, VecIndex;
51519 Align Alignment;
51520 unsigned Offset;
51521 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51522 return SDValue();
51523
51524 // Extract the one scalar element that is actually being stored.
51525 SDLoc DL(MS);
51526 SDValue Value = MS->getValue();
51527 EVT VT = Value.getValueType();
51528 EVT EltVT = VT.getVectorElementType();
51529 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51530 EltVT = MVT::f64;
51531 EVT CastVT = VT.changeVectorElementType(EltVT);
51532 Value = DAG.getBitcast(CastVT, Value);
51533 }
51534 SDValue Extract =
51535 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51536
51537 // Store that element at the appropriate offset from the base pointer.
51538 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51539 MS->getPointerInfo().getWithOffset(Offset),
51540 Alignment, MS->getMemOperand()->getFlags());
51541}
51542
51543static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51544 TargetLowering::DAGCombinerInfo &DCI,
51545 const X86Subtarget &Subtarget) {
51546 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51547 if (Mst->isCompressingStore())
51548 return SDValue();
51549
51550 EVT VT = Mst->getValue().getValueType();
51551 SDLoc dl(Mst);
51552 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51553
51554 if (Mst->isTruncatingStore())
51555 return SDValue();
51556
51557 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51558 return ScalarStore;
51559
51560 // If the mask value has been legalized to a non-boolean vector, try to
51561 // simplify ops leading up to it. We only demand the MSB of each lane.
51562 SDValue Mask = Mst->getMask();
51563 if (Mask.getScalarValueSizeInBits() != 1) {
51564 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51565 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51566 if (N->getOpcode() != ISD::DELETED_NODE)
51567 DCI.AddToWorklist(N);
51568 return SDValue(N, 0);
51569 }
51570 if (SDValue NewMask =
51571 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51572 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51573 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51574 Mst->getMemoryVT(), Mst->getMemOperand(),
51575 Mst->getAddressingMode());
51576 }
51577
51578 SDValue Value = Mst->getValue();
51579 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51580 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51581 Mst->getMemoryVT())) {
51582 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51583 Mst->getBasePtr(), Mst->getOffset(), Mask,
51584 Mst->getMemoryVT(), Mst->getMemOperand(),
51585 Mst->getAddressingMode(), true);
51586 }
51587
51588 return SDValue();
51589}
51590
51591static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51592 TargetLowering::DAGCombinerInfo &DCI,
51593 const X86Subtarget &Subtarget) {
51594 StoreSDNode *St = cast<StoreSDNode>(N);
51595 EVT StVT = St->getMemoryVT();
51596 SDLoc dl(St);
51597 SDValue StoredVal = St->getValue();
51598 EVT VT = StoredVal.getValueType();
51599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51600
51601 // Convert a store of vXi1 into a store of iX and a bitcast.
51602 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51603 VT.getVectorElementType() == MVT::i1) {
51604
51605 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51606 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51607
51608 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51609 St->getPointerInfo(), St->getOriginalAlign(),
51610 St->getMemOperand()->getFlags());
51611 }
51612
51613 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51614 // This will avoid a copy to k-register.
51615 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51616 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51617 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51618 SDValue Val = StoredVal.getOperand(0);
51619 // We must store zeros to the unused bits.
51620 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51621 return DAG.getStore(St->getChain(), dl, Val,
51622 St->getBasePtr(), St->getPointerInfo(),
51623 St->getOriginalAlign(),
51624 St->getMemOperand()->getFlags());
51625 }
51626
51627 // Widen v2i1/v4i1 stores to v8i1.
51628 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51629 Subtarget.hasAVX512()) {
51630 unsigned NumConcats = 8 / VT.getVectorNumElements();
51631 // We must store zeros to the unused bits.
51632 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51633 Ops[0] = StoredVal;
51634 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51635 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51636 St->getPointerInfo(), St->getOriginalAlign(),
51637 St->getMemOperand()->getFlags());
51638 }
51639
51640 // Turn vXi1 stores of constants into a scalar store.
51641 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51642 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51643 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51644 // If it's a v64i1 store without 64-bit support, we need two stores.
51645 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51646 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51647 StoredVal->ops().slice(0, 32));
51648 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51649 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51650 StoredVal->ops().slice(32, 32));
51651 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51652
51653 SDValue Ptr0 = St->getBasePtr();
51654 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
51655
51656 SDValue Ch0 =
51657 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51658 St->getOriginalAlign(),
51659 St->getMemOperand()->getFlags());
51660 SDValue Ch1 =
51661 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51662 St->getPointerInfo().getWithOffset(4),
51663 St->getOriginalAlign(),
51664 St->getMemOperand()->getFlags());
51665 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51666 }
51667
51668 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51669 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51670 St->getPointerInfo(), St->getOriginalAlign(),
51671 St->getMemOperand()->getFlags());
51672 }
51673
51674 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51675 // Sandy Bridge, perform two 16-byte stores.
51676 unsigned Fast;
51677 if (VT.is256BitVector() && StVT == VT &&
51678 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51679 *St->getMemOperand(), &Fast) &&
51680 !Fast) {
51681 unsigned NumElems = VT.getVectorNumElements();
51682 if (NumElems < 2)
51683 return SDValue();
51684
51685 return splitVectorStore(St, DAG);
51686 }
51687
51688 // Split under-aligned vector non-temporal stores.
51689 if (St->isNonTemporal() && StVT == VT &&
51690 St->getAlign().value() < VT.getStoreSize()) {
51691 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51692 // vectors or the legalizer can scalarize it to use MOVNTI.
51693 if (VT.is256BitVector() || VT.is512BitVector()) {
51694 unsigned NumElems = VT.getVectorNumElements();
51695 if (NumElems < 2)
51696 return SDValue();
51697 return splitVectorStore(St, DAG);
51698 }
51699
51700 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51701 // to use MOVNTI.
51702 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51703 MVT NTVT = Subtarget.hasSSE4A()
51704 ? MVT::v2f64
51705 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51706 return scalarizeVectorStore(St, NTVT, DAG);
51707 }
51708 }
51709
51710 // Try to optimize v16i16->v16i8 truncating stores when BWI is not supported,
51711 // but AVX512F is, by extending to v16i32 and truncating.
51712 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51713 St->getValue().getOpcode() == ISD::TRUNCATE &&
51714 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51715 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51716 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51717 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51718 St->getValue().getOperand(0));
51719 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51720 MVT::v16i8, St->getMemOperand());
51721 }
51722
51723 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51724 if (!St->isTruncatingStore() &&
51725 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51726 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51727 StoredVal.hasOneUse() &&
51728 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51729 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51730 return EmitTruncSStore(IsSigned, St->getChain(),
51731 dl, StoredVal.getOperand(0), St->getBasePtr(),
51732 VT, St->getMemOperand(), DAG);
51733 }
51734
51735 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51736 if (!St->isTruncatingStore()) {
51737 auto IsExtractedElement = [](SDValue V) {
51738 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51739 V = V.getOperand(0);
51740 unsigned Opc = V.getOpcode();
51741 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51742 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51743 V.getOperand(0).hasOneUse())
51744 return V.getOperand(0);
51745 return SDValue();
51746 };
51747 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51748 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51749 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51750 SDValue Src = Trunc.getOperand(0);
51751 MVT DstVT = Trunc.getSimpleValueType();
51752 MVT SrcVT = Src.getSimpleValueType();
51753 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51754 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51755 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51756 if (NumTruncBits == VT.getSizeInBits() &&
51757 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51758 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51759 TruncVT, St->getMemOperand());
51760 }
51761 }
51762 }
51763 }
51764
51765 // Optimize trunc store (of multiple scalars) to shuffle and store.
51766 // First, pack all of the elements in one place. Next, store to memory
51767 // in fewer chunks.
51768 if (St->isTruncatingStore() && VT.isVector()) {
51769 if (TLI.isTruncStoreLegal(VT, StVT)) {
51770 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51771 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51772 dl, Val, St->getBasePtr(),
51773 St->getMemoryVT(), St->getMemOperand(), DAG);
51774 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51775 DAG, dl))
51776 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51777 dl, Val, St->getBasePtr(),
51778 St->getMemoryVT(), St->getMemOperand(), DAG);
51779 }
51780
51781 return SDValue();
51782 }
51783
51784 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51785 unsigned AddrSpace = St->getAddressSpace();
51786 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51787 AddrSpace == X86AS::PTR32_UPTR) {
51788 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51789 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51790 SDValue Cast =
51791 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51792 return DAG.getTruncStore(
51793 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51794 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51795 St->getAAInfo());
51796 }
51797 }
51798
51799 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51800 // the FP state in cases where an emms may be missing.
51801 // A preferable solution to the general problem is to figure out the right
51802 // places to insert EMMS. This qualifies as a quick hack.
51803
51804 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51805 if (VT.getSizeInBits() != 64)
51806 return SDValue();
51807
51808 const Function &F = DAG.getMachineFunction().getFunction();
51809 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51810 bool F64IsLegal =
51811 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51812
51813 if (!F64IsLegal || Subtarget.is64Bit())
51814 return SDValue();
51815
51816 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51817 cast<LoadSDNode>(St->getValue())->isSimple() &&
51818 St->getChain().hasOneUse() && St->isSimple()) {
51819 auto *Ld = cast<LoadSDNode>(St->getValue());
51820
51821 if (!ISD::isNormalLoad(Ld))
51822 return SDValue();
51823
51824 // Avoid the transformation if there are multiple uses of the loaded value.
51825 if (!Ld->hasNUsesOfValue(1, 0))
51826 return SDValue();
51827
51828 SDLoc LdDL(Ld);
51829 SDLoc StDL(N);
51830 // Lower to a single movq load/store pair.
51831 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51832 Ld->getBasePtr(), Ld->getMemOperand());
51833
51834 // Make sure new load is placed in same chain order.
51835 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51836 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51837 St->getMemOperand());
51838 }
51839
51840 // This is similar to the above case, but here we handle a scalar 64-bit
51841 // integer store that is extracted from a vector on a 32-bit target.
51842 // If we have SSE2, then we can treat it like a floating-point double
51843 // to get past legalization. The execution dependencies fixup pass will
51844 // choose the optimal machine instruction for the store if this really is
51845 // an integer or v2f32 rather than an f64.
51846 if (VT == MVT::i64 &&
51847 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51848 SDValue OldExtract = St->getOperand(1);
51849 SDValue ExtOp0 = OldExtract.getOperand(0);
51850 unsigned VecSize = ExtOp0.getValueSizeInBits();
51851 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51852 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51853 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51854 BitCast, OldExtract.getOperand(1));
51855 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51856 St->getPointerInfo(), St->getOriginalAlign(),
51857 St->getMemOperand()->getFlags());
51858 }
51859
51860 return SDValue();
51861}
51862
51863static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51864 TargetLowering::DAGCombinerInfo &DCI,
51865 const X86Subtarget &Subtarget) {
51866 auto *St = cast<MemIntrinsicSDNode>(N);
51867
51868 SDValue StoredVal = N->getOperand(1);
51869 MVT VT = StoredVal.getSimpleValueType();
51870 EVT MemVT = St->getMemoryVT();
51871
51872 // Figure out which elements we demand.
51873 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51874 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51875
51876 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51877 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51878 if (N->getOpcode() != ISD::DELETED_NODE)
51879 DCI.AddToWorklist(N);
51880 return SDValue(N, 0);
51881 }
51882
51883 return SDValue();
51884}
51885
51886/// Return 'true' if this vector operation is "horizontal"
51887/// and return the operands for the horizontal operation in LHS and RHS. A
51888/// horizontal operation performs the binary operation on successive elements
51889/// of its first operand, then on successive elements of its second operand,
51890/// returning the resulting values in a vector. For example, if
51891/// A = < float a0, float a1, float a2, float a3 >
51892/// and
51893/// B = < float b0, float b1, float b2, float b3 >
51894/// then the result of doing a horizontal operation on A and B is
51895/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51896/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51897/// A horizontal-op B, for some already available A and B, and if so then LHS is
51898/// set to A, RHS to B, and the routine returns 'true'.
51899static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51900 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51901 bool IsCommutative,
51902 SmallVectorImpl<int> &PostShuffleMask,
51903 bool ForceHorizOp) {
51904 // If either operand is undef, bail out. The binop should be simplified.
51905 if (LHS.isUndef() || RHS.isUndef())
51906 return false;
51907
51908 // Look for the following pattern:
51909 // A = < float a0, float a1, float a2, float a3 >
51910 // B = < float b0, float b1, float b2, float b3 >
51911 // and
51912 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51913 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51914 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51915 // which is A horizontal-op B.
51916
51917 MVT VT = LHS.getSimpleValueType();
51918 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51919 "Unsupported vector type for horizontal add/sub");
51920 unsigned NumElts = VT.getVectorNumElements();
51921
51922 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51923 SmallVectorImpl<int> &ShuffleMask) {
51924 bool UseSubVector = false;
51925 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51926 Op.getOperand(0).getValueType().is256BitVector() &&
51927 llvm::isNullConstant(Op.getOperand(1))) {
51928 Op = Op.getOperand(0);
51929 UseSubVector = true;
51930 }
51931 SmallVector<SDValue, 2> SrcOps;
51932 SmallVector<int, 16> SrcMask, ScaledMask;
51933 SDValue BC = peekThroughBitcasts(Op);
51934 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51935 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51936 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51937 })) {
51938 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51939 if (!UseSubVector && SrcOps.size() <= 2 &&
51940 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51941 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51942 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51943 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51944 }
51945 if (UseSubVector && SrcOps.size() == 1 &&
51946 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51947 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51948 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51949 ShuffleMask.assign(Mask.begin(), Mask.end());
51950 }
51951 }
51952 };
51953
51954 // View LHS in the form
51955 // LHS = VECTOR_SHUFFLE A, B, LMask
51956 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51957 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51958 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51959 SDValue A, B;
51960 SmallVector<int, 16> LMask;
51961 GetShuffle(LHS, A, B, LMask);
51962
51963 // Likewise, view RHS in the form
51964 // RHS = VECTOR_SHUFFLE C, D, RMask
51965 SDValue C, D;
51966 SmallVector<int, 16> RMask;
51967 GetShuffle(RHS, C, D, RMask);
51968
51969 // At least one of the operands should be a vector shuffle.
51970 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51971 if (NumShuffles == 0)
51972 return false;
51973
51974 if (LMask.empty()) {
51975 A = LHS;
51976 for (unsigned i = 0; i != NumElts; ++i)
51977 LMask.push_back(i);
51978 }
51979
51980 if (RMask.empty()) {
51981 C = RHS;
51982 for (unsigned i = 0; i != NumElts; ++i)
51983 RMask.push_back(i);
51984 }
51985
51986 // If we have a unary mask, ensure the other op is set to null.
51987 if (isUndefOrInRange(LMask, 0, NumElts))
51988 B = SDValue();
51989 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51990 A = SDValue();
51991
51992 if (isUndefOrInRange(RMask, 0, NumElts))
51993 D = SDValue();
51994 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51995 C = SDValue();
51996
51997 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51998 // RHS operands and shuffle mask.
51999 if (A != C) {
52000 std::swap(C, D);
52001 ShuffleVectorSDNode::commuteMask(RMask);
52002 }
52003 // Check that the shuffles are both shuffling the same vectors.
52004 if (!(A == C && B == D))
52005 return false;
52006
52007 PostShuffleMask.clear();
52008 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52009
52010 // LHS and RHS are now:
52011 // LHS = shuffle A, B, LMask
52012 // RHS = shuffle A, B, RMask
52013 // Check that the masks correspond to performing a horizontal operation.
52014 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52015 // so we just repeat the inner loop if this is a 256-bit op.
52016 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52017 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52018 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52019 assert((NumEltsPer128BitChunk % 2 == 0) &&
52020 "Vector type should have an even number of elements in each lane");
52021 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52022 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52023 // Ignore undefined components.
52024 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52025 if (LIdx < 0 || RIdx < 0 ||
52026 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52027 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52028 continue;
52029
52030 // Check that successive odd/even elements are being operated on. If not,
52031 // this is not a horizontal operation.
52032 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52033 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52034 return false;
52035
52036 // Compute the post-shuffle mask index based on where the element
52037 // is stored in the HOP result, and where it needs to be moved to.
52038 int Base = LIdx & ~1u;
52039 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52040 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52041
52042 // The low half of the 128-bit result must choose from A.
52043 // The high half of the 128-bit result must choose from B,
52044 // unless B is undef. In that case, we are always choosing from A.
52045 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52046 Index += NumEltsPer64BitChunk;
52047 PostShuffleMask[i + j] = Index;
52048 }
52049 }
52050
52051 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52052 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52053
52054 bool IsIdentityPostShuffle =
52055 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52056 if (IsIdentityPostShuffle)
52057 PostShuffleMask.clear();
52058
52059 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52060 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52061 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52062 return false;
52063
52064 // If the source nodes are already used in HorizOps then always accept this.
52065 // Shuffle folding should merge these back together.
52066 auto FoundHorizUser = [&](SDNode *User) {
52067 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52068 };
52069 ForceHorizOp =
52070 ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) &&
52071 llvm::any_of(NewRHS->uses(), FoundHorizUser));
52072
52073 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52074 // shuffle the result.
52075 if (!ForceHorizOp &&
52076 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52077 (NumShuffles < 2 || !IsIdentityPostShuffle),
52078 DAG, Subtarget))
52079 return false;
52080
52081 LHS = DAG.getBitcast(VT, NewLHS);
52082 RHS = DAG.getBitcast(VT, NewRHS);
52083 return true;
52084}
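// Worked example (illustrative only): with
//   LHS = shuffle A, B, <0, 2, 4, 6>   ; <a0, a2, b0, b2>
//   RHS = shuffle A, B, <1, 3, 5, 7>   ; <a1, a3, b1, b3>
// the element-wise add LHS + RHS is <a0+a1, a2+a3, b0+b1, b2+b3>, which is
// exactly HADDPS A, B, so the two shuffles and the add collapse into one
// horizontal op (plus PostShuffleMask if lanes still need to be reordered).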
52085
52086// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52087static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52088 const X86Subtarget &Subtarget) {
52089 EVT VT = N->getValueType(0);
52090 unsigned Opcode = N->getOpcode();
52091 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52092 SmallVector<int, 8> PostShuffleMask;
52093
52094 auto MergableHorizOp = [N](unsigned HorizOpcode) {
52095 return N->hasOneUse() &&
52096 N->use_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
52097 (N->use_begin()->getOperand(0).getOpcode() == HorizOpcode ||
52098 N->use_begin()->getOperand(1).getOpcode() == HorizOpcode);
52099 };
52100
52101 switch (Opcode) {
52102 case ISD::FADD:
52103 case ISD::FSUB:
52104 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52105 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52106 SDValue LHS = N->getOperand(0);
52107 SDValue RHS = N->getOperand(1);
52108 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52109 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52110 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
52111 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52112 if (!PostShuffleMask.empty())
52113 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52114 DAG.getUNDEF(VT), PostShuffleMask);
52115 return HorizBinOp;
52116 }
52117 }
52118 break;
52119 case ISD::ADD:
52120 case ISD::SUB:
52121 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52122 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52123 SDValue LHS = N->getOperand(0);
52124 SDValue RHS = N->getOperand(1);
52125 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52126 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52127 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
52128 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52129 ArrayRef<SDValue> Ops) {
52130 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52131 };
52132 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52133 {LHS, RHS}, HOpBuilder);
52134 if (!PostShuffleMask.empty())
52135 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52136 DAG.getUNDEF(VT), PostShuffleMask);
52137 return HorizBinOp;
52138 }
52139 }
52140 break;
52141 }
52142
52143 return SDValue();
52144}
52145
52146// Try to combine the following nodes
52147// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52148// <i32 -2147483648[float -0.000000e+00]> 0
52149// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52150// <(load 4 from constant-pool)> t0, t29
52151// [t30: v16i32 = bitcast t27]
52152// t6: v16i32 = xor t7, t27[t30]
52153// t11: v16f32 = bitcast t6
52154// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52155// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52156// t22: v16f32 = bitcast t7
52157// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52158// t24: v32f16 = bitcast t23
52159static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52160 const X86Subtarget &Subtarget) {
52161 EVT VT = N->getValueType(0);
52162 SDValue LHS = N->getOperand(0);
52163 SDValue RHS = N->getOperand(1);
52164 int CombineOpcode =
52165 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52166 auto combineConjugation = [&](SDValue &r) {
52167 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52168 SDValue XOR = LHS.getOperand(0);
52169 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52170 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
52171 if (XORRHS.isConstant()) {
52172 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52173 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52174 if ((XORRHS.getBitWidth() == 32 &&
52175 XORRHS.getConstant() == ConjugationInt32) ||
52176 (XORRHS.getBitWidth() == 64 &&
52177 XORRHS.getConstant() == ConjugationInt64)) {
52178 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52179 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52180 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52181 r = DAG.getBitcast(VT, FCMulC);
52182 return true;
52183 }
52184 }
52185 }
52186 }
52187 return false;
52188 };
52189 SDValue Res;
52190 if (combineConjugation(Res))
52191 return Res;
52192 std::swap(LHS, RHS);
52193 if (combineConjugation(Res))
52194 return Res;
52195 return Res;
52196}
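// Note (illustrative only): the XOR masks above (0x80000000 per 32-bit lane)
// flip only the sign bit of the upper half of each packed complex value, i.e.
// they negate the imaginary part, so the xor is a complex conjugation. Since
// conj(x) * y == y * conj(x), the explicit conjugation can be folded into the
// conjugating multiply opcode with the operands swapped, as done above.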
52197
52198// Try to combine the following nodes:
52199// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52200static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52201 const X86Subtarget &Subtarget) {
52202 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52203 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52204 Flags.hasAllowContract();
52205 };
52206
52207 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52208 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52209 Flags.hasNoSignedZeros();
52210 };
52211 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
52212 APInt AI = APInt(32, 0x80008000, true);
52213 KnownBits Bits = DAG.computeKnownBits(Op);
52214 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
52215 Bits.getConstant() == AI;
52216 };
52217
52218 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52219 !AllowContract(N->getFlags()))
52220 return SDValue();
52221
52222 EVT VT = N->getValueType(0);
52223 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52224 return SDValue();
52225
52226 SDValue LHS = N->getOperand(0);
52227 SDValue RHS = N->getOperand(1);
52228 bool IsConj;
52229 SDValue FAddOp1, MulOp0, MulOp1;
52230 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52231 &IsVectorAllNegativeZero,
52232 &HasNoSignedZero](SDValue N) -> bool {
52233 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52234 return false;
52235 SDValue Op0 = N.getOperand(0);
52236 unsigned Opcode = Op0.getOpcode();
52237 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52238 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52239 MulOp0 = Op0.getOperand(0);
52240 MulOp1 = Op0.getOperand(1);
52241 IsConj = Opcode == X86ISD::VFCMULC;
52242 return true;
52243 }
52244 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52245 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52246 HasNoSignedZero(Op0->getFlags())) ||
52247 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
52248 MulOp0 = Op0.getOperand(0);
52249 MulOp1 = Op0.getOperand(1);
52250 IsConj = Opcode == X86ISD::VFCMADDC;
52251 return true;
52252 }
52253 }
52254 return false;
52255 };
52256
52257 if (GetCFmulFrom(LHS))
52258 FAddOp1 = RHS;
52259 else if (GetCFmulFrom(RHS))
52260 FAddOp1 = LHS;
52261 else
52262 return SDValue();
52263
52264 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52265 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52266 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52267 // FIXME: How do we handle when fast math flags of FADD are different from
52268 // CFMUL's?
52269 SDValue CFmul =
52270 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52271 return DAG.getBitcast(VT, CFmul);
52272}
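// Example (illustrative only): with contraction allowed,
//   FADD(A, bitcast(VFMULC(B, C)))  --> bitcast(VFMADDC(B, C, bitcast(A)))
//   FADD(A, bitcast(VFCMULC(B, C))) --> bitcast(VFCMADDC(B, C, bitcast(A)))
// mirroring the scalar identity a + b*c == fma(b, c, a) for complex values.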
52273
52274/// Do target-specific dag combines on floating-point adds/subs.
52275static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52276 const X86Subtarget &Subtarget) {
52277 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52278 return HOp;
52279
52280 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52281 return COp;
52282
52283 return SDValue();
52284}
52285
52286static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
52287 const X86Subtarget &Subtarget) {
52288 EVT VT = N->getValueType(0);
52289 SDValue Src = N->getOperand(0);
52290 EVT SrcVT = Src.getValueType();
52291 SDLoc DL(N);
52292
52293 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
52294 SrcVT != MVT::v2f32)
52295 return SDValue();
52296
52297 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
52298 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
52299 DAG.getUNDEF(SrcVT)));
52300}
52301
52302/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52303/// the codegen.
52304/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52305/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52306/// anything that is guaranteed to be transformed by DAGCombiner.
52307static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52308 const X86Subtarget &Subtarget,
52309 const SDLoc &DL) {
52310 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52311 SDValue Src = N->getOperand(0);
52312 unsigned SrcOpcode = Src.getOpcode();
52313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52314
52315 EVT VT = N->getValueType(0);
52316 EVT SrcVT = Src.getValueType();
52317
52318 auto IsFreeTruncation = [VT](SDValue Op) {
52319 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52320
52321 // See if this has been extended from a smaller/equal size to
52322 // the truncation size, allowing a truncation to combine with the extend.
52323 unsigned Opcode = Op.getOpcode();
52324 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52325 Opcode == ISD::ZERO_EXTEND) &&
52326 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52327 return true;
52328
52329 // See if this is a single use constant which can be constant folded.
52330 // NOTE: We don't peek through bitcasts here because there is currently
52331 // no support for constant folding truncate+bitcast+vector_of_constants. So
52332 // we'll just end up with a truncate on both operands which will
52333 // get turned back into (truncate (binop)) causing an infinite loop.
52334 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52335 };
52336
52337 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52338 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52339 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52340 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52341 };
52342
52343 // Don't combine if the operation has other uses.
52344 if (!Src.hasOneUse())
52345 return SDValue();
52346
52347 // Only support vector truncation for now.
52348 // TODO: i64 scalar math would benefit as well.
52349 if (!VT.isVector())
52350 return SDValue();
52351
52352 // In most cases it's only worth pre-truncating if we're only facing the cost
52353 // of one truncation.
52354 // i.e. if one of the inputs will constant fold or the input is repeated.
52355 switch (SrcOpcode) {
52356 case ISD::MUL:
52357 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52358 // better to truncate if we have the chance.
52359 if (SrcVT.getScalarType() == MVT::i64 &&
52360 TLI.isOperationLegal(SrcOpcode, VT) &&
52361 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52362 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52363 [[fallthrough]];
52364 case ISD::AND:
52365 case ISD::XOR:
52366 case ISD::OR:
52367 case ISD::ADD:
52368 case ISD::SUB: {
52369 SDValue Op0 = Src.getOperand(0);
52370 SDValue Op1 = Src.getOperand(1);
52371 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52372 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52373 return TruncateArithmetic(Op0, Op1);
52374 break;
52375 }
52376 }
52377
52378 return SDValue();
52379}
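// Example (illustrative only): a v8i64 multiply that is only used truncated
// to v8i32 can be rewritten as
//   trunc(mul(x, y)) --> mul(trunc(x), trunc(y))
// because the low 32 bits of a product depend only on the low 32 bits of the
// operands; this trades an expensive pre-AVX512DQ i64 multiply for a cheap
// i32 multiply.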
52380
52381// Try to form a MULHU or MULHS node by looking for
52382// (trunc (srl (mul ext, ext), 16))
52383// TODO: This is X86 specific because we want to be able to handle wide types
52384// before type legalization. But we can only do it if the vector will be
52385// legalized via widening/splitting. Type legalization can't handle promotion
52386// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52387// combiner.
52388static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52389 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52390 // First instruction should be a right shift of a multiply.
52391 if (Src.getOpcode() != ISD::SRL ||
52392 Src.getOperand(0).getOpcode() != ISD::MUL)
52393 return SDValue();
52394
52395 if (!Subtarget.hasSSE2())
52396 return SDValue();
52397
52398 // Only handle vXi16 types that are at least 128 bits unless they will be
52399 // widened.
52400 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52401 return SDValue();
52402
52403 // Input type should be at least vXi32.
52404 EVT InVT = Src.getValueType();
52405 if (InVT.getVectorElementType().getSizeInBits() < 32)
52406 return SDValue();
52407
52408 // Need a shift by 16.
52409 APInt ShiftAmt;
52410 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52411 ShiftAmt != 16)
52412 return SDValue();
52413
52414 SDValue LHS = Src.getOperand(0).getOperand(0);
52415 SDValue RHS = Src.getOperand(0).getOperand(1);
52416
52417 // Count leading sign/zero bits on both inputs - if there are enough then
52418 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52419 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52420 // truncations may actually be free by peeking through to the ext source.
52421 auto IsSext = [&DAG](SDValue V) {
52422 return DAG.ComputeMaxSignificantBits(V) <= 16;
52423 };
52424 auto IsZext = [&DAG](SDValue V) {
52425 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52426 };
52427
52428 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52429 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52430 if (!IsSigned && !IsUnsigned)
52431 return SDValue();
52432
52433 // Check if both inputs are extensions, which will be removed by truncation.
52434 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52435 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52436 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52437 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52438 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52439 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52440
52441 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52442 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52443 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52444 // will have to split anyway.
52445 unsigned InSizeInBits = InVT.getSizeInBits();
52446 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52447 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52448 (InSizeInBits % 16) == 0) {
52449 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52450 InVT.getSizeInBits() / 16);
52451 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52452 DAG.getBitcast(BCVT, RHS));
52453 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52454 }
52455
52456 // Truncate back to source type.
52457 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52458 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52459
52460 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52461 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52462}
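// A minimal scalar sketch (plain C++; helper names are illustrative, not from
// this file) of the identity combinePMULH relies on: for 16-bit values widened
// to 32 bits, "multiply, shift right by 16, truncate" is exactly the high half
// of the 16x16 product, i.e. what PMULHUW/PMULHW compute per element.
static unsigned short MulHU16Sketch(unsigned short A, unsigned short B) {
  unsigned Wide = (unsigned)A * (unsigned)B; // (mul (zext a), (zext b))
  return (unsigned short)(Wide >> 16);       // (trunc (srl wide, 16))
}
static short MulHS16Sketch(short A, short B) {
  int Wide = (int)A * (int)B;            // (mul (sext a), (sext b))
  return (short)((unsigned)Wide >> 16);  // bits [31:16] of the product
}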
52463
52464// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52465// from one vector with signed bytes from another vector, adds together
52466// adjacent pairs of 16-bit products, and saturates the result before
52467// truncating to 16-bits.
52468//
52469// Which looks something like this:
52470// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52471// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52472 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52473 const X86Subtarget &Subtarget,
52474 const SDLoc &DL) {
52475 if (!VT.isVector() || !Subtarget.hasSSSE3())
52476 return SDValue();
52477
52478 unsigned NumElems = VT.getVectorNumElements();
52479 EVT ScalarVT = VT.getVectorElementType();
52480 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52481 return SDValue();
52482
52483 SDValue SSatVal = detectSSatPattern(In, VT);
52484 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52485 return SDValue();
52486
52487 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52488 // of multiplies from even/odd elements.
52489 SDValue N0 = SSatVal.getOperand(0);
52490 SDValue N1 = SSatVal.getOperand(1);
52491
52492 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52493 return SDValue();
52494
52495 SDValue N00 = N0.getOperand(0);
52496 SDValue N01 = N0.getOperand(1);
52497 SDValue N10 = N1.getOperand(0);
52498 SDValue N11 = N1.getOperand(1);
52499
52500 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52501 // Canonicalize zero_extend to LHS.
52502 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52503 std::swap(N00, N01);
52504 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52505 std::swap(N10, N11);
52506
52507 // Ensure we have a zero_extend and a sign_extend.
52508 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52509 N01.getOpcode() != ISD::SIGN_EXTEND ||
52510 N10.getOpcode() != ISD::ZERO_EXTEND ||
52511 N11.getOpcode() != ISD::SIGN_EXTEND)
52512 return SDValue();
52513
52514 // Peek through the extends.
52515 N00 = N00.getOperand(0);
52516 N01 = N01.getOperand(0);
52517 N10 = N10.getOperand(0);
52518 N11 = N11.getOperand(0);
52519
52520 // Ensure the extend is from vXi8.
52521 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52522 N01.getValueType().getVectorElementType() != MVT::i8 ||
52523 N10.getValueType().getVectorElementType() != MVT::i8 ||
52524 N11.getValueType().getVectorElementType() != MVT::i8)
52525 return SDValue();
52526
52527 // All inputs should be build_vectors.
52528 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52529 N01.getOpcode() != ISD::BUILD_VECTOR ||
52530 N10.getOpcode() != ISD::BUILD_VECTOR ||
52531 N11.getOpcode() != ISD::BUILD_VECTOR)
52532 return SDValue();
52533
52534 // N00/N10 are zero extended. N01/N11 are sign extended.
52535
52536 // For each element, we need to ensure we have an odd element from one vector
52537 // multiplied by the odd element of another vector and the even element from
52538 // one of the same vectors being multiplied by the even element from the
52539 // other vector. So we need to make sure for each element i, this operation
52540 // is being performed:
52541 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52542 SDValue ZExtIn, SExtIn;
52543 for (unsigned i = 0; i != NumElems; ++i) {
52544 SDValue N00Elt = N00.getOperand(i);
52545 SDValue N01Elt = N01.getOperand(i);
52546 SDValue N10Elt = N10.getOperand(i);
52547 SDValue N11Elt = N11.getOperand(i);
52548 // TODO: Be more tolerant to undefs.
52549 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52550 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52551 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52552 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52553 return SDValue();
52554 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52555 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52556 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52557 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52558 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52559 return SDValue();
52560 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52561 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52562 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52563 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52564 // Add is commutative so indices can be reordered.
52565 if (IdxN00 > IdxN10) {
52566 std::swap(IdxN00, IdxN10);
52567 std::swap(IdxN01, IdxN11);
52568 }
52569 // N0 indices must be the even element. N1 indices must be the next odd element.
52570 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52571 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52572 return SDValue();
52573 SDValue N00In = N00Elt.getOperand(0);
52574 SDValue N01In = N01Elt.getOperand(0);
52575 SDValue N10In = N10Elt.getOperand(0);
52576 SDValue N11In = N11Elt.getOperand(0);
52577 // First time we find an input, capture it.
52578 if (!ZExtIn) {
52579 ZExtIn = N00In;
52580 SExtIn = N01In;
52581 }
52582 if (ZExtIn != N00In || SExtIn != N01In ||
52583 ZExtIn != N10In || SExtIn != N11In)
52584 return SDValue();
52585 }
52586
52587 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
52588 EVT ExtVT = Ext.getValueType();
52589 if (ExtVT.getVectorNumElements() != NumElems * 2) {
52590 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
52591 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
52592 DAG.getIntPtrConstant(0, DL));
52593 }
52594 };
52595 ExtractVec(ZExtIn);
52596 ExtractVec(SExtIn);
52597
52598 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52599 ArrayRef<SDValue> Ops) {
52600 // Shrink by adding truncate nodes and let DAGCombine fold with the
52601 // sources.
52602 EVT InVT = Ops[0].getValueType();
52603 assert(InVT.getScalarType() == MVT::i8 &&
52604 "Unexpected scalar element type");
52605 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52606 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52607 InVT.getVectorNumElements() / 2);
52608 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52609 };
52610 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52611 PMADDBuilder);
52612}
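// A minimal scalar sketch (plain C++; helper name is illustrative, not from
// this file) of the per-lane arithmetic detectPMADDUBSW matches: each i16 lane
// is the signed-saturated sum of two adjacent unsigned-byte * signed-byte
// products.
static short PMADDUBSWLaneSketch(unsigned char A0, unsigned char A1,
                                 signed char B0, signed char B1) {
  int Sum = (int)A0 * (int)B0 + (int)A1 * (int)B1; // zext/sext, mul, add
  if (Sum > 32767)
    Sum = 32767;      // signed saturation to i16
  if (Sum < -32768)
    Sum = -32768;
  return (short)Sum;
}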
52613
52614 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52615 const X86Subtarget &Subtarget) {
52616 EVT VT = N->getValueType(0);
52617 SDValue Src = N->getOperand(0);
52618 SDLoc DL(N);
52619
52620 // Attempt to pre-truncate inputs to arithmetic ops instead.
52621 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52622 return V;
52623
52624 // Try to detect PMADD
52625 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52626 return PMAdd;
52627
52628 // Try to combine truncation with signed/unsigned saturation.
52629 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52630 return Val;
52631
52632 // Try to combine PMULHUW/PMULHW for vXi16.
52633 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52634 return V;
52635
52636 // The bitcast source is a direct mmx result.
52637 // Detect bitcasts between i32 and x86mmx.
52638 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52639 SDValue BCSrc = Src.getOperand(0);
52640 if (BCSrc.getValueType() == MVT::x86mmx)
52641 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52642 }
52643
52644 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
52645 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
52646 Src.hasOneUse())
52647 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
52648
52649 return SDValue();
52650}
52651
52652 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52653 TargetLowering::DAGCombinerInfo &DCI) {
52654 EVT VT = N->getValueType(0);
52655 SDValue In = N->getOperand(0);
52656 SDLoc DL(N);
52657
52658 if (SDValue SSatVal = detectSSatPattern(In, VT))
52659 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52660 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52661 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52662
52663 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52664 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52665 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52666 return SDValue(N, 0);
52667
52668 return SDValue();
52669}
52670
52671/// Returns the negated value if the node \p N flips sign of FP value.
52672///
52673/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52674/// or FSUB(0, x)
52675/// AVX512F does not have FXOR, so FNEG is lowered as
52676/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52677 /// In this case we go through all bitcasts.
52678/// This also recognizes splat of a negated value and returns the splat of that
52679/// value.
52680static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52681 if (N->getOpcode() == ISD::FNEG)
52682 return N->getOperand(0);
52683
52684 // Don't recurse exponentially.
52685 if (Depth > SelectionDAG::MaxRecursionDepth)
52686 return SDValue();
52687
52688 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52689
52690 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52691 EVT VT = Op->getValueType(0);
52692
52693 // Make sure the element size doesn't change.
52694 if (VT.getScalarSizeInBits() != ScalarSize)
52695 return SDValue();
52696
52697 unsigned Opc = Op.getOpcode();
52698 switch (Opc) {
52699 case ISD::VECTOR_SHUFFLE: {
52700 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52701 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52702 if (!Op.getOperand(1).isUndef())
52703 return SDValue();
52704 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52705 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52706 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52707 cast<ShuffleVectorSDNode>(Op)->getMask());
52708 break;
52709 }
52710 case ISD::INSERT_VECTOR_ELT: {
52711 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52712 // -V, INDEX).
52713 SDValue InsVector = Op.getOperand(0);
52714 SDValue InsVal = Op.getOperand(1);
52715 if (!InsVector.isUndef())
52716 return SDValue();
52717 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52718 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52719 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52720 NegInsVal, Op.getOperand(2));
52721 break;
52722 }
52723 case ISD::FSUB:
52724 case ISD::XOR:
52725 case X86ISD::FXOR: {
52726 SDValue Op1 = Op.getOperand(1);
52727 SDValue Op0 = Op.getOperand(0);
52728
52729 // For XOR and FXOR, we want to check if constant
52730 // bits of Op1 are sign bit masks. For FSUB, we
52731 // have to check if constant bits of Op0 are sign
52732 // bit masks and hence we swap the operands.
52733 if (Opc == ISD::FSUB)
52734 std::swap(Op0, Op1);
52735
52736 APInt UndefElts;
52737 SmallVector<APInt, 16> EltBits;
52738 // Extract constant bits and see if they are all
52739 // sign bit masks. Ignore the undef elements.
52740 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52741 /* AllowWholeUndefs */ true,
52742 /* AllowPartialUndefs */ false)) {
52743 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52744 if (!UndefElts[I] && !EltBits[I].isSignMask())
52745 return SDValue();
52746
52747 // Only allow bitcast from correctly-sized constant.
52748 Op0 = peekThroughBitcasts(Op0);
52749 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52750 return Op0;
52751 }
52752 break;
52753 } // case
52754 } // switch
52755
52756 return SDValue();
52757}
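// A minimal sketch (plain C++; assumes 32-bit IEEE-754 float on a two's
// complement host; helper name is illustrative) of why isFNEG treats an XOR
// with the sign-bit mask as a negation: flipping only bit 31 negates the value
// without touching exponent or mantissa (it also flips the sign of zeros and
// NaNs).
static float FNegViaXorSketch(float X) {
  unsigned Bits;
  __builtin_memcpy(&Bits, &X, sizeof(Bits));  // bitcast f32 -> i32
  Bits ^= 0x80000000u;                        // xor with the sign bit mask
  float Neg;
  __builtin_memcpy(&Neg, &Bits, sizeof(Neg)); // bitcast i32 -> f32
  return Neg;                                 // == -X
}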
52758
52759static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52760 bool NegRes) {
52761 if (NegMul) {
52762 switch (Opcode) {
52763 // clang-format off
52764 default: llvm_unreachable("Unexpected opcode");
52765 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52766 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52767 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52768 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52769 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52770 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52771 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52772 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52773 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52774 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52775 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52776 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52777 // clang-format on
52778 }
52779 }
52780
52781 if (NegAcc) {
52782 switch (Opcode) {
52783 // clang-format off
52784 default: llvm_unreachable("Unexpected opcode");
52785 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52786 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52787 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52788 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52789 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52790 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52791 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52792 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52793 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52794 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52795 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52796 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52797 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52798 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52799 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52800 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52801 // clang-format on
52802 }
52803 }
52804
52805 if (NegRes) {
52806 switch (Opcode) {
52807 // For accuracy reasons, we never combine fneg and fma under strict FP.
52808 // clang-format off
52809 default: llvm_unreachable("Unexpected opcode");
52810 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52811 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52812 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52813 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52814 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52815 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52816 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52817 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52818 // clang-format on
52819 }
52820 }
52821
52822 return Opcode;
52823}
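// A minimal scalar sketch (plain C++; illustrative only) of the sign algebra
// the table above encodes: negating the product maps FMADD<->FNMADD and
// FMSUB<->FNMSUB, negating the accumulator maps FMADD<->FMSUB, and negating
// the result composes both.
static double FMAFormsSketch(double A, double B, double C) {
  double FMAdd = A * B + C;     // the FMA form
  double FNMSub = -(A * B) - C; // NegMul and NegAcc applied together
  // FMAdd + FNMSub is 0.0 for finite results: negation is exact and rounding
  // to nearest is symmetric, so FNMSub == -FMAdd.
  return FMAdd + FNMSub;
}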
52824
52825/// Do target-specific dag combines on floating point negations.
52826 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52827 TargetLowering::DAGCombinerInfo &DCI,
52828 const X86Subtarget &Subtarget) {
52829 EVT OrigVT = N->getValueType(0);
52830 SDValue Arg = isFNEG(DAG, N);
52831 if (!Arg)
52832 return SDValue();
52833
52834 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52835 EVT VT = Arg.getValueType();
52836 EVT SVT = VT.getScalarType();
52837 SDLoc DL(N);
52838
52839 // Let legalize expand this if it isn't a legal type yet.
52840 if (!TLI.isTypeLegal(VT))
52841 return SDValue();
52842
52843 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52844 // use of a constant by performing (-0 - A*B) instead.
52845 // FIXME: Check rounding control flags as well once it becomes available.
52846 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52847 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52848 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52849 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52850 Arg.getOperand(1), Zero);
52851 return DAG.getBitcast(OrigVT, NewNode);
52852 }
52853
52854 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52855 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52856 if (SDValue NegArg =
52857 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52858 return DAG.getBitcast(OrigVT, NegArg);
52859
52860 return SDValue();
52861}
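// A minimal scalar sketch (plain C++; illustrative only) of the (-0 - A*B)
// rewrite above: when -0.0 results need not be preserved (the no-signed-zeros
// case), -(A * B) can be formed as an FNMSUB with a zero accumulator, avoiding
// the load of a sign-mask constant.
static double FNegOfFMulSketch(double A, double B) {
  return 0.0 - (A * B); // differs from -(A * B) only when A * B == +0.0
}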
52862
52863 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52864 bool LegalOperations,
52865 bool ForCodeSize,
52866 NegatibleCost &Cost,
52867 unsigned Depth) const {
52868 // fneg patterns are removable even if they have multiple uses.
52869 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52870 Cost = NegatibleCost::Cheaper;
52871 return DAG.getBitcast(Op.getValueType(), Arg);
52872 }
52873
52874 EVT VT = Op.getValueType();
52875 EVT SVT = VT.getScalarType();
52876 unsigned Opc = Op.getOpcode();
52877 SDNodeFlags Flags = Op.getNode()->getFlags();
52878 switch (Opc) {
52879 case ISD::FMA:
52880 case X86ISD::FMSUB:
52881 case X86ISD::FNMADD:
52882 case X86ISD::FNMSUB:
52883 case X86ISD::FMADD_RND:
52884 case X86ISD::FMSUB_RND:
52885 case X86ISD::FNMADD_RND:
52886 case X86ISD::FNMSUB_RND: {
52887 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52888 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52889 !isOperationLegal(ISD::FMA, VT))
52890 break;
52891
52892 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52893 // if it may have signed zeros.
52894 if (!Flags.hasNoSignedZeros())
52895 break;
52896
52897 // This is always negatible for free but we might be able to remove some
52898 // extra operand negations as well.
52899 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52900 for (int i = 0; i != 3; ++i)
52901 NewOps[i] = getCheaperNegatedExpression(
52902 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52903
52904 bool NegA = !!NewOps[0];
52905 bool NegB = !!NewOps[1];
52906 bool NegC = !!NewOps[2];
52907 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52908
52909 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52910 : NegatibleCost::Neutral;
52911
52912 // Fill in the non-negated ops with the original values.
52913 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52914 if (!NewOps[i])
52915 NewOps[i] = Op.getOperand(i);
52916 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52917 }
52918 case X86ISD::FRCP:
52919 if (SDValue NegOp0 =
52920 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52921 ForCodeSize, Cost, Depth + 1))
52922 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52923 break;
52924 }
52925
52926 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52927 ForCodeSize, Cost, Depth);
52928}
52929
52930 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52931 const X86Subtarget &Subtarget) {
52932 MVT VT = N->getSimpleValueType(0);
52933 // If we have integer vector types available, use the integer opcodes.
52934 if (!VT.isVector() || !Subtarget.hasSSE2())
52935 return SDValue();
52936
52937 SDLoc dl(N);
52938
52939 unsigned IntBits = VT.getScalarSizeInBits();
52940 MVT IntSVT = MVT::getIntegerVT(IntBits);
52941 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52942
52943 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52944 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52945 unsigned IntOpcode;
52946 switch (N->getOpcode()) {
52947 // clang-format off
52948 default: llvm_unreachable("Unexpected FP logic op");
52949 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52950 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52951 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52952 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52953 // clang-format on
52954 }
52955 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52956 return DAG.getBitcast(VT, IntOp);
52957}
52958
52959
52960/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52961 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52962 if (N->getOpcode() != ISD::XOR)
52963 return SDValue();
52964
52965 SDValue LHS = N->getOperand(0);
52966 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52967 return SDValue();
52968
52969 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52970 X86::CondCode(LHS->getConstantOperandVal(0)));
52971 SDLoc DL(N);
52972 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52973}
52974
52975 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
52976 const X86Subtarget &Subtarget) {
52977 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52978 "Invalid opcode for combing with CTLZ");
52979 if (Subtarget.hasFastLZCNT())
52980 return SDValue();
52981
52982 EVT VT = N->getValueType(0);
52983 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52984 (VT != MVT::i64 || !Subtarget.is64Bit()))
52985 return SDValue();
52986
52987 SDValue N0 = N->getOperand(0);
52988 SDValue N1 = N->getOperand(1);
52989
52990 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52991 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52992 return SDValue();
52993
52994 SDValue OpCTLZ;
52995 SDValue OpSizeTM1;
52996
52997 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52998 OpCTLZ = N1;
52999 OpSizeTM1 = N0;
53000 } else if (N->getOpcode() == ISD::SUB) {
53001 return SDValue();
53002 } else {
53003 OpCTLZ = N0;
53004 OpSizeTM1 = N1;
53005 }
53006
53007 if (!OpCTLZ.hasOneUse())
53008 return SDValue();
53009 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53010 if (!C)
53011 return SDValue();
53012
53013 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53014 return SDValue();
53015 EVT OpVT = VT;
53016 SDValue Op = OpCTLZ.getOperand(0);
53017 if (VT == MVT::i8) {
53018 // Zero extend to i32 since there is no i8 bsr.
53019 OpVT = MVT::i32;
53020 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53021 }
53022
53023 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53024 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53025 if (VT == MVT::i8)
53026 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53027
53028 return Op;
53029}
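// A minimal sketch (plain C++; helper name is illustrative, X assumed non-zero)
// of the identity used above: for a 32-bit value, (BitWidth - 1) XOR ctlz(x)
// equals (BitWidth - 1) - ctlz(x), the index of the highest set bit, which is
// exactly what BSR returns.
static unsigned BSRFromCTLZSketch(unsigned X) {
  unsigned CTLZ = 0;
  for (unsigned Bit = 1u << 31; Bit != 0 && (X & Bit) == 0; Bit >>= 1)
    ++CTLZ;         // naive count of leading zeros
  return 31 - CTLZ; // == 31 ^ CTLZ, since CTLZ fits in the low 5 bits
}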
53030
53031 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53032 TargetLowering::DAGCombinerInfo &DCI,
53033 const X86Subtarget &Subtarget) {
53034 SDValue N0 = N->getOperand(0);
53035 SDValue N1 = N->getOperand(1);
53036 EVT VT = N->getValueType(0);
53037 SDLoc DL(N);
53038
53039 // If this is SSE1 only convert to FXOR to avoid scalarization.
53040 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53041 return DAG.getBitcast(MVT::v4i32,
53042 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
53043 DAG.getBitcast(MVT::v4f32, N0),
53044 DAG.getBitcast(MVT::v4f32, N1)));
53045 }
53046
53047 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53048 return Cmp;
53049
53050 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53051 return R;
53052
53053 if (SDValue R = combineBitOpWithShift(N, DAG))
53054 return R;
53055
53056 if (SDValue R = combineBitOpWithPACK(N, DAG))
53057 return R;
53058
53059 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53060 return FPLogic;
53061
53062 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
53063 return R;
53064
53065 if (DCI.isBeforeLegalizeOps())
53066 return SDValue();
53067
53068 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53069 return SetCC;
53070
53071 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53072 return R;
53073
53074 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53075 return RV;
53076
53077 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53078 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53079 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53080 N0.getOperand(0).getValueType().isVector() &&
53081 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53082 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53083 return DAG.getBitcast(
53084 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
53085 }
53086
53087 // Handle AVX512 mask widening.
53088 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53089 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53090 VT.getVectorElementType() == MVT::i1 &&
53091 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53092 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53093 return DAG.getNode(
53094 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
53095 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
53096 N0.getOperand(2));
53097 }
53098
53099 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53100 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53101 // TODO: Under what circumstances could this be performed in DAGCombine?
53102 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53103 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53104 SDValue TruncExtSrc = N0.getOperand(0);
53105 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53106 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53107 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53108 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53109 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53110 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53111 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53112 }
53113 }
53114
53115 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53116 return R;
53117
53118 return combineFneg(N, DAG, DCI, Subtarget);
53119}
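// A minimal sketch (plain C++; helper name is illustrative) of the
// constant-merging fold above: zero extension distributes over XOR, so
// xor(zext(xor(x, c1)), c2) equals xor(zext(x), xor(zext(c1), c2)) and the two
// constants can be folded together.
static bool XorZextFoldSketch(unsigned X, unsigned C1, unsigned long long C2) {
  unsigned long long Original = (unsigned long long)(X ^ C1) ^ C2;
  unsigned long long Rewritten =
      (unsigned long long)X ^ ((unsigned long long)C1 ^ C2);
  return Original == Rewritten; // always true
}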
53120
53121 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
53122 TargetLowering::DAGCombinerInfo &DCI,
53123 const X86Subtarget &Subtarget) {
53124 SDValue N0 = N->getOperand(0);
53125 EVT VT = N->getValueType(0);
53126
53127 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
53128 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
53129 SDValue Src = N0.getOperand(0);
53130 EVT SrcVT = Src.getValueType();
53131 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
53132 (DCI.isBeforeLegalize() ||
53133 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
53134 Subtarget.hasSSSE3()) {
53135 unsigned NumElts = SrcVT.getVectorNumElements();
53136 SmallVector<int, 32> ReverseMask(NumElts);
53137 for (unsigned I = 0; I != NumElts; ++I)
53138 ReverseMask[I] = (NumElts - 1) - I;
53139 SDValue Rev =
53140 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
53141 return DAG.getBitcast(VT, Rev);
53142 }
53143 }
53144
53145 return SDValue();
53146}
53147
53148// Various combines to try to convert to avgceilu.
53149 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
53150 TargetLowering::DAGCombinerInfo &DCI,
53151 const X86Subtarget &Subtarget) {
53152 unsigned Opcode = N->getOpcode();
53153 SDValue N0 = N->getOperand(0);
53154 SDValue N1 = N->getOperand(1);
53155 EVT VT = N->getValueType(0);
53156 EVT SVT = VT.getScalarType();
53157 SDLoc DL(N);
53158
53159 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
53160 // Only useful on vXi8 which doesn't have good SRA handling.
53161 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
53162 APInt SignBit = APInt::getSignMask(8);
53163 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
53164 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
53165 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
53166 return DAG.getNode(ISD::XOR, DL, VT,
53167 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
53168 }
53169
53170 return SDValue();
53171}
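// A minimal scalar sketch (plain C++, one i8 lane, two's complement assumed;
// helper name is illustrative) of the sign-flip trick above: XOR with 0x80
// biases signed bytes into unsigned order, the rounding-up unsigned average of
// the biased values is the biased signed average, and a final XOR with 0x80
// removes the bias.
static signed char AvgCeilSViaUSketch(signed char X, signed char Y) {
  unsigned UX = (unsigned char)(X ^ 0x80);         // flipsign(x)
  unsigned UY = (unsigned char)(Y ^ 0x80);         // flipsign(y)
  unsigned Avg = (UX + UY + 1) >> 1;               // avgceilu
  return (signed char)(unsigned char)(Avg ^ 0x80); // flipsign(result)
}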
53172
53173 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53174 TargetLowering::DAGCombinerInfo &DCI,
53175 const X86Subtarget &Subtarget) {
53176 EVT VT = N->getValueType(0);
53177 unsigned NumBits = VT.getSizeInBits();
53178
53179 // TODO - Constant Folding.
53180
53181 // Simplify the inputs.
53182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53183 APInt DemandedMask(APInt::getAllOnes(NumBits));
53184 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53185 return SDValue(N, 0);
53186
53187 return SDValue();
53188}
53189
53190 static bool isNullFPScalarOrVectorConst(SDValue V) {
53191 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53192}
53193
53194/// If a value is a scalar FP zero or a vector FP zero (potentially including
53195/// undefined elements), return a zero constant that may be used to fold away
53196/// that value. In the case of a vector, the returned constant will not contain
53197/// undefined elements even if the input parameter does. This makes it suitable
53198/// to be used as a replacement operand with operations (eg, bitwise-and) where
53199/// an undef should not propagate.
53200 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53201 const X86Subtarget &Subtarget) {
53202 if (!isNullFPScalarOrVectorConst(V))
53203 return SDValue();
53204
53205 if (V.getValueType().isVector())
53206 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53207
53208 return V;
53209}
53210
53211 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53212 const X86Subtarget &Subtarget) {
53213 SDValue N0 = N->getOperand(0);
53214 SDValue N1 = N->getOperand(1);
53215 EVT VT = N->getValueType(0);
53216 SDLoc DL(N);
53217
53218 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53219 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53220 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53221 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53222 return SDValue();
53223
53224 auto isAllOnesConstantFP = [](SDValue V) {
53225 if (V.getSimpleValueType().isVector())
53226 return ISD::isBuildVectorAllOnes(V.getNode());
53227 auto *C = dyn_cast<ConstantFPSDNode>(V);
53228 return C && C->getConstantFPValue()->isAllOnesValue();
53229 };
53230
53231 // fand (fxor X, -1), Y --> fandn X, Y
53232 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53233 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53234
53235 // fand X, (fxor Y, -1) --> fandn Y, X
53236 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53237 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53238
53239 return SDValue();
53240}
53241
53242/// Do target-specific dag combines on X86ISD::FAND nodes.
53243 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53244 const X86Subtarget &Subtarget) {
53245 // FAND(0.0, x) -> 0.0
53246 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53247 return V;
53248
53249 // FAND(x, 0.0) -> 0.0
53250 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53251 return V;
53252
53253 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53254 return V;
53255
53256 return lowerX86FPLogicOp(N, DAG, Subtarget);
53257}
53258
53259/// Do target-specific dag combines on X86ISD::FANDN nodes.
53260 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53261 const X86Subtarget &Subtarget) {
53262 // FANDN(0.0, x) -> x
53263 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53264 return N->getOperand(1);
53265
53266 // FANDN(x, 0.0) -> 0.0
53267 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53268 return V;
53269
53270 return lowerX86FPLogicOp(N, DAG, Subtarget);
53271}
53272
53273/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53274 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53275 TargetLowering::DAGCombinerInfo &DCI,
53276 const X86Subtarget &Subtarget) {
53277 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53278
53279 // F[X]OR(0.0, x) -> x
53280 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53281 return N->getOperand(1);
53282
53283 // F[X]OR(x, 0.0) -> x
53284 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53285 return N->getOperand(0);
53286
53287 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53288 return NewVal;
53289
53290 return lowerX86FPLogicOp(N, DAG, Subtarget);
53291}
53292
53293/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53294 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53295 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53296
53297 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53298 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53299 !DAG.getTarget().Options.NoSignedZerosFPMath)
53300 return SDValue();
53301
53302 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53303 // into FMINC and FMAXC, which are Commutative operations.
53304 unsigned NewOp = 0;
53305 switch (N->getOpcode()) {
53306 default: llvm_unreachable("unknown opcode");
53307 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53308 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53309 }
53310
53311 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53312 N->getOperand(0), N->getOperand(1));
53313}
53314
53315 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53316 const X86Subtarget &Subtarget) {
53317 EVT VT = N->getValueType(0);
53318 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
53319 return SDValue();
53320
53321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53322
53323 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53324 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53325 (Subtarget.hasFP16() && VT == MVT::f16) ||
53326 (VT.isVector() && TLI.isTypeLegal(VT))))
53327 return SDValue();
53328
53329 SDValue Op0 = N->getOperand(0);
53330 SDValue Op1 = N->getOperand(1);
53331 SDLoc DL(N);
53332 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53333
53334 // If we don't have to respect NaN inputs, this is a direct translation to x86
53335 // min/max instructions.
53336 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53337 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53338
53339 // If one of the operands is known non-NaN use the native min/max instructions
53340 // with the non-NaN input as second operand.
53341 if (DAG.isKnownNeverNaN(Op1))
53342 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53343 if (DAG.isKnownNeverNaN(Op0))
53344 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53345
53346 // If we have to respect NaN inputs, this takes at least 3 instructions.
53347 // Favor a library call when operating on a scalar and minimizing code size.
53348 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53349 return SDValue();
53350
53351 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53352 VT);
53353
53354 // There are 4 possibilities involving NaN inputs, and these are the required
53355 // outputs:
53356 // Op1
53357 // Num NaN
53358 // ----------------
53359 // Num | Max | Op0 |
53360 // Op0 ----------------
53361 // NaN | Op1 | NaN |
53362 // ----------------
53363 //
53364 // The SSE FP max/min instructions were not designed for this case, but rather
53365 // to implement:
53366 // Min = Op1 < Op0 ? Op1 : Op0
53367 // Max = Op1 > Op0 ? Op1 : Op0
53368 //
53369 // So they always return Op0 if either input is a NaN. However, we can still
53370 // use those instructions for fmaxnum by selecting away a NaN input.
53371
53372 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53373 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53374 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53375
53376 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53377 // are NaN, the NaN value of Op1 is the result.
53378 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53379}
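// A minimal scalar sketch (plain C++; helper name is illustrative) of the
// sequence built above for fmaxnum: the MAXSS-style compare returns its second
// source (here Op0) whenever an input is NaN, so a final "is Op0 NaN" select
// recovers the semantics described in the table.
static float FMaxNumSketch(float Op0, float Op1) {
  float MinOrMax = (Op1 > Op0) ? Op1 : Op0; // MAXSS(Op1, Op0) semantics
  bool IsOp0Nan = Op0 != Op0;               // unordered self-compare
  return IsOp0Nan ? Op1 : MinOrMax;         // select away a NaN Op0
}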
53380
53381 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53382 TargetLowering::DAGCombinerInfo &DCI) {
53383 EVT VT = N->getValueType(0);
53384 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53385
53386 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53387 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53388 return SDValue(N, 0);
53389
53390 // Convert a full vector load into vzload when not all bits are needed.
53391 SDValue In = N->getOperand(0);
53392 MVT InVT = In.getSimpleValueType();
53393 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53394 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53395 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53396 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53397 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53398 MVT MemVT = MVT::getIntegerVT(NumBits);
53399 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53400 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53401 SDLoc dl(N);
53402 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53403 DAG.getBitcast(InVT, VZLoad));
53404 DCI.CombineTo(N, Convert);
53405 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53406 DCI.recursivelyDeleteUnusedNodes(LN);
53407 return SDValue(N, 0);
53408 }
53409 }
53410
53411 return SDValue();
53412}
53413
53414 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53415 TargetLowering::DAGCombinerInfo &DCI) {
53416 bool IsStrict = N->isTargetStrictFPOpcode();
53417 EVT VT = N->getValueType(0);
53418
53419 // Convert a full vector load into vzload when not all bits are needed.
53420 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53421 MVT InVT = In.getSimpleValueType();
53422 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53423 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53424 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53425 LoadSDNode *LN = cast<LoadSDNode>(In);
53426 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53427 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53428 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53429 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53430 SDLoc dl(N);
53431 if (IsStrict) {
53432 SDValue Convert =
53433 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53434 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53435 DCI.CombineTo(N, Convert, Convert.getValue(1));
53436 } else {
53437 SDValue Convert =
53438 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53439 DCI.CombineTo(N, Convert);
53440 }
53441 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53442 DCI.recursivelyDeleteUnusedNodes(LN);
53443 return SDValue(N, 0);
53444 }
53445 }
53446
53447 return SDValue();
53448}
53449
53450/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53451 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53452 TargetLowering::DAGCombinerInfo &DCI,
53453 const X86Subtarget &Subtarget) {
53454 SDValue N0 = N->getOperand(0);
53455 SDValue N1 = N->getOperand(1);
53456 MVT VT = N->getSimpleValueType(0);
53457 int NumElts = VT.getVectorNumElements();
53458 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53459 SDLoc DL(N);
53460
53461 // ANDNP(undef, x) -> 0
53462 // ANDNP(x, undef) -> 0
53463 if (N0.isUndef() || N1.isUndef())
53464 return DAG.getConstant(0, DL, VT);
53465
53466 // ANDNP(0, x) -> x
53467 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53468 return N1;
53469
53470 // ANDNP(x, 0) -> 0
53471 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53472 return DAG.getConstant(0, DL, VT);
53473
53474 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
53475 if (ISD::isBuildVectorAllOnes(N1.getNode()))
53476 return DAG.getNOT(DL, N0, VT);
53477
53478 // Turn ANDNP back to AND if input is inverted.
53479 if (SDValue Not = IsNOT(N0, DAG))
53480 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
53481
53482 // Fold for better commutativity:
53483 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
53484 if (N1->hasOneUse())
53485 if (SDValue Not = IsNOT(N1, DAG))
53486 return DAG.getNOT(
53487 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
53488
53489 // Constant Folding
53490 APInt Undefs0, Undefs1;
53491 SmallVector<APInt> EltBits0, EltBits1;
53492 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
53493 /*AllowWholeUndefs*/ true,
53494 /*AllowPartialUndefs*/ true)) {
53495 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
53496 /*AllowWholeUndefs*/ true,
53497 /*AllowPartialUndefs*/ true)) {
53498 SmallVector<APInt> ResultBits;
53499 for (int I = 0; I != NumElts; ++I)
53500 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53501 return getConstVector(ResultBits, VT, DAG, DL);
53502 }
53503
53504 // Constant fold NOT(N0) to allow us to use AND.
53505 // Ensure this is only performed if we can confirm that the bitcasted source
53506 // has one use to prevent an infinite loop with canonicalizeBitSelect.
53507 if (N0->hasOneUse()) {
53508 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53509 if (BC0.getOpcode() != ISD::BITCAST) {
53510 for (APInt &Elt : EltBits0)
53511 Elt = ~Elt;
53512 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
53513 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53514 }
53515 }
53516 }
53517
53518 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53519 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53520 SDValue Op(N, 0);
53521 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53522 return Res;
53523
53524 // If either operand is a constant mask, then only the elements that aren't
53525 // zero are actually demanded by the other operand.
53526 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53527 APInt UndefElts;
53528 SmallVector<APInt> EltBits;
53529 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53530 APInt DemandedElts = APInt::getAllOnes(NumElts);
53531 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53532 EltBits)) {
53533 DemandedBits.clearAllBits();
53534 DemandedElts.clearAllBits();
53535 for (int I = 0; I != NumElts; ++I) {
53536 if (UndefElts[I]) {
53537 // We can't assume an undef src element gives an undef dst - the
53538 // other src might be zero.
53539 DemandedBits.setAllBits();
53540 DemandedElts.setBit(I);
53541 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53542 (!Invert && !EltBits[I].isZero())) {
53543 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53544 DemandedElts.setBit(I);
53545 }
53546 }
53547 }
53548 return std::make_pair(DemandedBits, DemandedElts);
53549 };
53550 APInt Bits0, Elts0;
53551 APInt Bits1, Elts1;
53552 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53553 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53554
53555 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53556 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53557 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53558 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53559 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53560 if (N->getOpcode() != ISD::DELETED_NODE)
53561 DCI.AddToWorklist(N);
53562 return SDValue(N, 0);
53563 }
53564 }
53565
53566 return SDValue();
53567}
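// A minimal scalar sketch (plain C++; helper name is illustrative) of the
// ANDNP identities used above: ANDNP(a, b) == ~a & b, so ANDNP(NOT(x), y)
// folds to AND(x, y) and ANDNP(x, NOT(y)) equals NOT(OR(x, y)) by De Morgan.
static bool ANDNPIdentitiesSketch(unsigned X, unsigned Y) {
  unsigned AndnpOfNotX = ~(~X) & Y; // ANDNP(NOT(x), y)
  unsigned AndnpOfNotY = ~X & ~Y;   // ANDNP(x, NOT(y))
  return AndnpOfNotX == (X & Y) && AndnpOfNotY == ~(X | Y); // always true
}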
53568
53569 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53570 TargetLowering::DAGCombinerInfo &DCI) {
53571 SDValue N1 = N->getOperand(1);
53572
53573 // BT ignores high bits in the bit index operand.
53574 unsigned BitWidth = N1.getValueSizeInBits();
53575 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53576 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53577 if (N->getOpcode() != ISD::DELETED_NODE)
53578 DCI.AddToWorklist(N);
53579 return SDValue(N, 0);
53580 }
53581
53582 return SDValue();
53583}
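// A minimal sketch (plain C++; helper name is illustrative) of why only the
// low log2(BitWidth) bits of the index are demanded: a register BT interprets
// the bit index modulo the operand width, so for a 32-bit test only index bits
// [4:0] matter.
static bool BT32Sketch(unsigned Val, unsigned Idx) {
  return ((Val >> (Idx & 31)) & 1u) != 0; // Idx and (Idx & 31) pick the same bit
}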
53584
53585 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53586 TargetLowering::DAGCombinerInfo &DCI) {
53587 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53588 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53589
53590 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53592 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53593 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53594 if (N->getOpcode() != ISD::DELETED_NODE)
53595 DCI.AddToWorklist(N);
53596 return SDValue(N, 0);
53597 }
53598
53599 // Convert a full vector load into vzload when not all bits are needed.
53600 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53601 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53602 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53603 SDLoc dl(N);
53604 if (IsStrict) {
53605 SDValue Convert = DAG.getNode(
53606 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53607 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53608 DCI.CombineTo(N, Convert, Convert.getValue(1));
53609 } else {
53610 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53611 DAG.getBitcast(MVT::v8i16, VZLoad));
53612 DCI.CombineTo(N, Convert);
53613 }
53614
53615 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53616 DCI.recursivelyDeleteUnusedNodes(LN);
53617 return SDValue(N, 0);
53618 }
53619 }
53620 }
53621
53622 return SDValue();
53623}
53624
53625// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53626 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53627 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53628
53629 EVT DstVT = N->getValueType(0);
53630
53631 SDValue N0 = N->getOperand(0);
53632 SDValue N1 = N->getOperand(1);
53633 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53634
53635 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53636 return SDValue();
53637
53638 // Look through single use any_extends / truncs.
53639 SDValue IntermediateBitwidthOp;
53640 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53641 N0.hasOneUse()) {
53642 IntermediateBitwidthOp = N0;
53643 N0 = N0.getOperand(0);
53644 }
53645
53646 // See if we have a single use cmov.
53647 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53648 return SDValue();
53649
53650 SDValue CMovOp0 = N0.getOperand(0);
53651 SDValue CMovOp1 = N0.getOperand(1);
53652
53653 // Make sure both operands are constants.
53654 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53655 !isa<ConstantSDNode>(CMovOp1.getNode()))
53656 return SDValue();
53657
53658 SDLoc DL(N);
53659
53660 // If we looked through an any_extend/trunc above, apply the same op to the constants.
53661 if (IntermediateBitwidthOp) {
53662 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53663 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53664 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53665 }
53666
53667 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53668 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53669
53670 EVT CMovVT = DstVT;
53671 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53672 if (DstVT == MVT::i16) {
53673 CMovVT = MVT::i32;
53674 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53675 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53676 }
53677
53678 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53679 N0.getOperand(2), N0.getOperand(3));
53680
53681 if (CMovVT != DstVT)
53682 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53683
53684 return CMov;
53685}
53686
53687 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53688 const X86Subtarget &Subtarget) {
53689 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53690
53691 if (SDValue V = combineSextInRegCmov(N, DAG))
53692 return V;
53693
53694 EVT VT = N->getValueType(0);
53695 SDValue N0 = N->getOperand(0);
53696 SDValue N1 = N->getOperand(1);
53697 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53698 SDLoc dl(N);
53699
53700 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
53701 // both SSE and AVX2 since there is no sign-extended shift right
53702 // operation on a vector with 64-bit elements.
53703 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53704 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53705 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53706 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53707 SDValue N00 = N0.getOperand(0);
53708
53709 // EXTLOAD has a better solution on AVX2,
53710 // it may be replaced with X86ISD::VSEXT node.
53711 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53712 if (!ISD::isNormalLoad(N00.getNode()))
53713 return SDValue();
53714
53715 // Attempt to promote any comparison mask ops before moving the
53716 // SIGN_EXTEND_INREG in the way.
53717 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
53718 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53719
53720 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53721 SDValue Tmp =
53722 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53723 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53724 }
53725 }
53726 return SDValue();
53727}
53728
53729/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53730/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53731/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53732/// opportunities to combine math ops, use an LEA, or use a complex addressing
53733/// mode. This can eliminate extend, add, and shift instructions.
53734 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53735 const X86Subtarget &Subtarget) {
53736 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53737 Ext->getOpcode() != ISD::ZERO_EXTEND)
53738 return SDValue();
53739
53740 // TODO: This should be valid for other integer types.
53741 EVT VT = Ext->getValueType(0);
53742 if (VT != MVT::i64)
53743 return SDValue();
53744
53745 SDValue Add = Ext->getOperand(0);
53746 if (Add.getOpcode() != ISD::ADD)
53747 return SDValue();
53748
53749 SDValue AddOp0 = Add.getOperand(0);
53750 SDValue AddOp1 = Add.getOperand(1);
53751 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53752 bool NSW = Add->getFlags().hasNoSignedWrap();
53753 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53754 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
53755 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
53756
53757 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53758 // into the 'zext'
53759 if ((Sext && !NSW) || (!Sext && !NUW))
53760 return SDValue();
53761
53762 // Having a constant operand to the 'add' ensures that we are not increasing
53763 // the instruction count because the constant is extended for free below.
53764 // A constant operand can also become the displacement field of an LEA.
53765 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
53766 if (!AddOp1C)
53767 return SDValue();
53768
53769 // Don't make the 'add' bigger if there's no hope of combining it with some
53770 // other 'add' or 'shl' instruction.
53771 // TODO: It may be profitable to generate simpler LEA instructions in place
53772 // of single 'add' instructions, but the cost model for selecting an LEA
53773 // currently has a high threshold.
53774 bool HasLEAPotential = false;
53775 for (auto *User : Ext->uses()) {
53776 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53777 HasLEAPotential = true;
53778 break;
53779 }
53780 }
53781 if (!HasLEAPotential)
53782 return SDValue();
53783
53784 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53785 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53786 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53787 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53788
53789 // The wider add is guaranteed to not wrap because both operands are
53790 // sign-extended.
53791 SDNodeFlags Flags;
53792 Flags.setNoSignedWrap(NSW);
53793 Flags.setNoUnsignedWrap(NUW);
53794 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53795}
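// A minimal sketch (plain C++; helper name is illustrative) of the identity
// exploited above: when the narrow add cannot overflow (the nsw precondition),
// sign extending after the add equals adding the sign-extended operands in the
// wide type, so the extension can be hoisted ahead of the add.
static long long SextOfNswAddSketch(int X, int C) {
  // Precondition (nsw): X + C does not overflow in 32 bits.
  long long Narrow = (long long)(X + C);        // sext(add_nsw(x, C))
  long long Wide = (long long)X + (long long)C; // add(sext(x), C_sext)
  return Narrow == Wide ? Wide : -1;            // equal under the precondition
}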
53796
53797// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53798// operands and the result of CMOV is not used anywhere else - promote CMOV
53799// itself instead of promoting its result. This could be beneficial, because:
53800// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53801// (or more) pseudo-CMOVs only when they go one-after-another and
53802// getting rid of result extension code after CMOV will help that.
53803// 2) Promotion of constant CMOV arguments is free, hence the
53804// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53805// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
53806// promotion is also good in terms of code-size.
53807// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
53808// promotion).
53809 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53810 SDValue CMovN = Extend->getOperand(0);
53811 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53812 return SDValue();
53813
53814 EVT TargetVT = Extend->getValueType(0);
53815 unsigned ExtendOpcode = Extend->getOpcode();
53816 SDLoc DL(Extend);
53817
53818 EVT VT = CMovN.getValueType();
53819 SDValue CMovOp0 = CMovN.getOperand(0);
53820 SDValue CMovOp1 = CMovN.getOperand(1);
53821
53822 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53823 !isa<ConstantSDNode>(CMovOp1.getNode()))
53824 return SDValue();
53825
53826 // Only extend to i32 or i64.
53827 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53828 return SDValue();
53829
53830 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53831 // are free.
53832 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53833 return SDValue();
53834
53835 // If this a zero extend to i64, we should only extend to i32 and use a free
53836 // zero extend to finish.
53837 EVT ExtendVT = TargetVT;
53838 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53839 ExtendVT = MVT::i32;
53840
53841 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53842 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53843
53844 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53845 CMovN.getOperand(2), CMovN.getOperand(3));
53846
53847 // Finish extending if needed.
53848 if (ExtendVT != TargetVT)
53849 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53850
53851 return Res;
53852}
53853
53854// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53855// result type.
53856 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53857 const X86Subtarget &Subtarget) {
53858 SDValue N0 = N->getOperand(0);
53859 EVT VT = N->getValueType(0);
53860 SDLoc dl(N);
53861
53862 // Only do this combine with AVX512 for vector extends.
53863 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53864 return SDValue();
53865
53866 // Only combine legal element types.
53867 EVT SVT = VT.getVectorElementType();
53868 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53869 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53870 return SDValue();
53871
53872 // We don't have a CMPP instruction for vXf16.
53873 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53874 return SDValue();
53875 // We can only do this if the vector size is 256 bits or less.
53876 unsigned Size = VT.getSizeInBits();
53877 if (Size > 256 && Subtarget.useAVX512Regs())
53878 return SDValue();
53879
53880 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53881 // those are the only integer compares we have.
53882 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53883 if (ISD::isUnsignedIntSetCC(CC))
53884 return SDValue();
53885
53886 // Only do this combine if the extension will be fully consumed by the setcc.
53887 EVT N00VT = N0.getOperand(0).getValueType();
53888 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53889 if (Size != MatchingVecType.getSizeInBits())
53890 return SDValue();
53891
53892 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53893
53894 if (N->getOpcode() == ISD::ZERO_EXTEND)
53895 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53896
53897 return Res;
53898}
53899
53900 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53901 TargetLowering::DAGCombinerInfo &DCI,
53902 const X86Subtarget &Subtarget) {
53903 SDValue N0 = N->getOperand(0);
53904 EVT VT = N->getValueType(0);
53905 SDLoc DL(N);
53906
53907 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53908 if (!DCI.isBeforeLegalizeOps() &&
53909 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53910 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53911 N0->getOperand(1));
53912 bool ReplaceOtherUses = !N0.hasOneUse();
53913 DCI.CombineTo(N, Setcc);
53914 // Replace other uses with a truncate of the widened setcc_carry.
53915 if (ReplaceOtherUses) {
53916 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53917 N0.getValueType(), Setcc);
53918 DCI.CombineTo(N0.getNode(), Trunc);
53919 }
53920
53921 return SDValue(N, 0);
53922 }
53923
53924 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53925 return NewCMov;
53926
53927 if (!DCI.isBeforeLegalizeOps())
53928 return SDValue();
53929
53930 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53931 return V;
53932
53933 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53934 DAG, DCI, Subtarget))
53935 return V;
53936
53937 if (VT.isVector()) {
53938 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53939 return R;
53940
53942 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53943 }
53944
53945 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53946 return NewAdd;
53947
53948 return SDValue();
53949}
53950
53951// Inverting a constant vector is profitable if it can be eliminated and the
53952// inverted vector is already present in DAG. Otherwise, it will be loaded
53953// anyway.
53954//
53955// We determine which of the values can be completely eliminated and invert it.
53956// If both are eliminable, select a vector with the first negative element.
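// For example, if (fma x, y, <2.0, 4.0>) is the only user of the constant and
// the negated vector <-2.0, -4.0> already exists in the DAG, returning the
// negated vector lets the caller fold the negation into the FMA opcode instead
// of materializing a second constant.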
53959 "ConstantFP build vector expected");
53960 // Check if we can eliminate V. We assume that if a value is only used in
53961 // FMAs, we can eliminate it, since this function is invoked for each FMA
53962 // with this vector.
53963 auto IsNotFMA = [](SDNode *Use) {
53964 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53965 };
53966 if (llvm::any_of(V->uses(), IsNotFMA))
53967 return SDValue();
53968
53969 SmallVector<SDValue, 8> Ops;
53970 EVT VT = V.getValueType();
53971 EVT EltVT = VT.getVectorElementType();
53972 for (const SDValue &Op : V->op_values()) {
53973 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53974 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53975 } else {
53976 assert(Op.isUndef());
53977 Ops.push_back(DAG.getUNDEF(EltVT));
53978 }
53979 }
53980
53981 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53982 if (!NV)
53983 return SDValue();
53984
53985 // If an inverted version cannot be eliminated, choose it instead of the
53986 // original version.
53987 if (llvm::any_of(NV->uses(), IsNotFMA))
53988 return SDValue(NV, 0);
53989
53990 // If the inverted version also can be eliminated, we have to consistently
53991 // prefer one of the values. We prefer the constant whose first (non-undef)
53992 // element is negative.
53993 // N.B. We need to skip undefs that may precede a value.
53994 for (const SDValue &Op : V->op_values()) {
53995 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53996 if (Cst->isNegative())
53997 return SDValue();
53998 break;
53999 }
54000 }
54001 return SDValue(NV, 0);
54002}
54003
54004static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54005 TargetLowering::DAGCombinerInfo &DCI,
54006 const X86Subtarget &Subtarget) {
54007 SDLoc dl(N);
54008 EVT VT = N->getValueType(0);
54009 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54010
54011 // Let legalize expand this if it isn't a legal type yet.
54012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54013 if (!TLI.isTypeLegal(VT))
54014 return SDValue();
54015
54016 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54017 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54018 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54019
54020 // If the operation allows fast-math and the target does not support FMA,
54021 // split this into mul+add to avoid libcall(s).
54022 SDNodeFlags Flags = N->getFlags();
54023 if (!IsStrict && Flags.hasAllowReassociation() &&
54024 TLI.isOperationExpand(ISD::FMA, VT)) {
54025 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54026 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54027 }
54028
54029 EVT ScalarVT = VT.getScalarType();
54030 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54031 !Subtarget.hasAnyFMA()) &&
54032 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54033 return SDValue();
54034
54035 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54036 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54037 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54038 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54039 CodeSize)) {
54040 V = NegV;
54041 return true;
54042 }
54043 // Look through extract_vector_elts. If it comes from an FNEG, create a
54044 // new extract from the FNEG input.
54045 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54046 isNullConstant(V.getOperand(1))) {
54047 SDValue Vec = V.getOperand(0);
54048 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54049 Vec, DAG, LegalOperations, CodeSize)) {
54050 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54051 NegV, V.getOperand(1));
54052 return true;
54053 }
54054 }
54055 // Lookup if there is an inverted version of constant vector V in DAG.
54056 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
54057 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
54058 V = NegV;
54059 return true;
54060 }
54061 }
54062 return false;
54063 };
54064
54065 // Do not convert the passthru input of scalar intrinsics.
54066 // FIXME: We could allow negations of the lower element only.
54067 bool NegA = invertIfNegative(A);
54068 bool NegB = invertIfNegative(B);
54069 bool NegC = invertIfNegative(C);
54070
54071 if (!NegA && !NegB && !NegC)
54072 return SDValue();
54073
54074 unsigned NewOpcode =
54075 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54076
54077 // Propagate fast-math-flags to new FMA node.
54078 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54079 if (IsStrict) {
54080 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54081 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54082 {N->getOperand(0), A, B, C});
54083 } else {
54084 if (N->getNumOperands() == 4)
54085 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54086 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54087 }
54088}
54089
54090// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54091// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54092static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54093 TargetLowering::DAGCombinerInfo &DCI) {
54094 SDLoc dl(N);
54095 EVT VT = N->getValueType(0);
54096 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54097 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54098 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54099
54100 SDValue N2 = N->getOperand(2);
54101
54102 SDValue NegN2 =
54103 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54104 if (!NegN2)
54105 return SDValue();
54106 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54107
54108 if (N->getNumOperands() == 4)
54109 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54110 NegN2, N->getOperand(3));
54111 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54112 NegN2);
54113}
54114
54115static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54116 TargetLowering::DAGCombinerInfo &DCI,
54117 const X86Subtarget &Subtarget) {
54118 SDLoc dl(N);
54119 SDValue N0 = N->getOperand(0);
54120 EVT VT = N->getValueType(0);
54121
54122 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54123 // FIXME: Is this needed? We don't seem to have any tests for it.
54124 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54125 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54126 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54127 N0->getOperand(1));
54128 bool ReplaceOtherUses = !N0.hasOneUse();
54129 DCI.CombineTo(N, Setcc);
54130 // Replace other uses with a truncate of the widened setcc_carry.
54131 if (ReplaceOtherUses) {
54132 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54133 N0.getValueType(), Setcc);
54134 DCI.CombineTo(N0.getNode(), Trunc);
54135 }
54136
54137 return SDValue(N, 0);
54138 }
54139
54140 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54141 return NewCMov;
54142
54143 if (DCI.isBeforeLegalizeOps())
54144 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54145 return V;
54146
54147 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54148 DAG, DCI, Subtarget))
54149 return V;
54150
54151 if (VT.isVector())
54152 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
54153 return R;
54154
54155 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54156 return NewAdd;
54157
54158 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54159 return R;
54160
54161 // TODO: Combine with any target/faux shuffle.
54162 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54164 SDValue N00 = N0.getOperand(0);
54165 SDValue N01 = N0.getOperand(1);
54166 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54167 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54168 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54169 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54170 return concatSubVectors(N00, N01, DAG, dl);
54171 }
54172 }
54173
54174 return SDValue();
54175}
54176
54177/// If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
54178/// pre-promote its result type since vXi1 vectors don't get promoted
54179/// during type legalization.
54180static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54181 SDValue RHS, ISD::CondCode CC,
54182 const SDLoc &DL, SelectionDAG &DAG,
54183 const X86Subtarget &Subtarget) {
54184 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54185 VT.getVectorElementType() == MVT::i1 &&
54186 (OpVT.getVectorElementType() == MVT::i8 ||
54187 OpVT.getVectorElementType() == MVT::i16)) {
54188 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54189 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54190 }
54191 return SDValue();
54192}
54193
54194static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54195 TargetLowering::DAGCombinerInfo &DCI,
54196 const X86Subtarget &Subtarget) {
54197 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54198 const SDValue LHS = N->getOperand(0);
54199 const SDValue RHS = N->getOperand(1);
54200 EVT VT = N->getValueType(0);
54201 EVT OpVT = LHS.getValueType();
54202 SDLoc DL(N);
54203
54204 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54205 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54206 Subtarget))
54207 return V;
54208
54209 if (VT == MVT::i1) {
54210 X86::CondCode X86CC;
54211 if (SDValue V =
54212 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54213 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54214 }
54215
54216 if (OpVT.isScalarInteger()) {
54217 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54218 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
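// (or X, Y) == X exactly when Y has no bits set outside X, i.e. when
// (and (not X), Y) == 0, which maps onto a TEST/ANDN-style compare with zero.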
54219 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54220 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54221 if (N0.getOperand(0) == N1)
54222 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54223 N0.getOperand(1));
54224 if (N0.getOperand(1) == N1)
54225 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54226 N0.getOperand(0));
54227 }
54228 return SDValue();
54229 };
54230 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54231 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54232 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54233 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54234
54235 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54236 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
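// Similarly, (and X, Y) == Y exactly when every bit of Y is also set in X,
// i.e. when (and (not X), Y) == 0.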
54237 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54238 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54239 if (N0.getOperand(0) == N1)
54240 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54241 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54242 if (N0.getOperand(1) == N1)
54243 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54244 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54245 }
54246 return SDValue();
54247 };
54248 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54249 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54250 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54251 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54252
54253 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54254 // cmpne(trunc(x),C) --> cmpne(x,C)
54255 // iff x upper bits are zero.
54256 if (LHS.getOpcode() == ISD::TRUNCATE &&
54257 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54258 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
54259 EVT SrcVT = LHS.getOperand(0).getValueType();
54260 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54261 OpVT.getScalarSizeInBits());
54262 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54263 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54264 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54265 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54266 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
54267 }
54268
54269 // With C as a power of 2 and C != 0 and C != INT_MIN:
54270 // icmp eq Abs(X) C ->
54271 // (icmp eq A, C) | (icmp eq A, -C)
54272 // icmp ne Abs(X) C ->
54273 // (icmp ne A, C) & (icmp ne A, -C)
54274 // Both of these patterns can be better optimized in
54275 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54276 // integers which is checked above.
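// For example, with C == 4: (icmp eq (abs X), 4) becomes
// (or (icmp eq X, 4), (icmp eq X, -4)).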
54277 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54278 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54279 const APInt &CInt = C->getAPIntValue();
54280 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54281 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54282 SDValue BaseOp = LHS.getOperand(0);
54283 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54284 SDValue SETCC1 = DAG.getSetCC(
54285 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54286 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54287 SETCC0, SETCC1);
54288 }
54289 }
54290 }
54291 }
54292 }
54293
54294 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54295 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54296 // Using temporaries to avoid messing up operand ordering for later
54297 // transformations if this doesn't work.
54298 SDValue Op0 = LHS;
54299 SDValue Op1 = RHS;
54300 ISD::CondCode TmpCC = CC;
54301 // Put build_vector on the right.
54302 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54303 std::swap(Op0, Op1);
54304 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54305 }
54306
54307 bool IsSEXT0 =
54308 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54309 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54310 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54311
54312 if (IsSEXT0 && IsVZero1) {
54313 assert(VT == Op0.getOperand(0).getValueType() &&
54314 "Unexpected operand type");
54315 if (TmpCC == ISD::SETGT)
54316 return DAG.getConstant(0, DL, VT);
54317 if (TmpCC == ISD::SETLE)
54318 return DAG.getConstant(1, DL, VT);
54319 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54320 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54321
54322 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54323 "Unexpected condition code!");
54324 return Op0.getOperand(0);
54325 }
54326 }
54327
54328 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
54329 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
54330 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
54331 // it's going to a mask, there are signed AVX512 comparisons).
54332 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54333 bool CanMakeSigned = false;
54334 if (ISD::isUnsignedIntSetCC(CC)) {
54335 KnownBits CmpKnown =
54336 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
54337 // If we know LHS/RHS share the same sign bit at each element we can
54338 // make this signed.
54339 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54340 // across all lanes. So a pattern where the sign varies from lane to
54341 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54342 // missed. We could get around this by demanding each lane
54343 // independently, but this isn't the most important optimization and
54344 // that may eat into compile time.
54345 CanMakeSigned =
54346 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54347 }
54348 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54349 SDValue LHSOut = LHS;
54350 SDValue RHSOut = RHS;
54351 ISD::CondCode NewCC = CC;
54352 switch (CC) {
54353 case ISD::SETGE:
54354 case ISD::SETUGE:
54355 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54356 /*NSW*/ true))
54357 LHSOut = NewLHS;
54358 else if (SDValue NewRHS = incDecVectorConstant(
54359 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54360 RHSOut = NewRHS;
54361 else
54362 break;
54363
54364 [[fallthrough]];
54365 case ISD::SETUGT:
54366 NewCC = ISD::SETGT;
54367 break;
54368
54369 case ISD::SETLE:
54370 case ISD::SETULE:
54371 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54372 /*NSW*/ true))
54373 LHSOut = NewLHS;
54374 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54375 /*NSW*/ true))
54376 RHSOut = NewRHS;
54377 else
54378 break;
54379
54380 [[fallthrough]];
54381 case ISD::SETULT:
54382 // Will be swapped to SETGT in LowerVSETCC*.
54383 NewCC = ISD::SETLT;
54384 break;
54385 default:
54386 break;
54387 }
54388 if (NewCC != CC) {
54389 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54390 NewCC, DL, DAG, Subtarget))
54391 return R;
54392 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54393 }
54394 }
54395 }
54396
54397 if (SDValue R =
54398 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54399 return R;
54400
54401 // In the middle end transforms:
54402 // `(or (icmp eq X, C), (icmp eq X, C+1))`
54403 // -> `(icmp ult (add x, -C), 2)`
54404 // Likewise inverted cases with `ugt`.
54405 //
54406 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
54407 // in worse codegen. So, undo the middle-end transform and go back to `(or
54408 // (icmp eq), (icmp eq))` form.
54409 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
54410 // the xmm approach.
54411 //
54412 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
54413 // ne))` as it doesn't end up being a net instruction win.
54414 // TODO: We might want to do this for avx512 as well if we `sext` the result.
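// For example, (setcc (add X, -5), 2, setult) is rebuilt here as
// (or (setcc X, 5, seteq), (setcc X, 6, seteq)), which needs only PCMPEQ.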
54415 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
54416 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
54417 !Subtarget.hasAVX512() &&
54418 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
54419 Subtarget.hasAVX2()) &&
54420 LHS.hasOneUse()) {
54421
54422 APInt CmpC;
54423 SDValue AddC = LHS.getOperand(1);
54424 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
54426 // See which form we have depending on the constant/condition.
54427 SDValue C0 = SDValue();
54428 SDValue C1 = SDValue();
54429
54430 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
54431 // we will end up generating an additional constant. Keeping in the
54432 // current form has a slight latency cost, but it is probably worth saving a
54433 // constant.
54436 // Pass
54437 }
54438 // Normal Cases
54439 else if ((CC == ISD::SETULT && CmpC == 2) ||
54440 (CC == ISD::SETULE && CmpC == 1)) {
54441 // These will constant fold.
54442 C0 = DAG.getNegative(AddC, DL, OpVT);
54443 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
54444 DAG.getAllOnesConstant(DL, OpVT));
54445 }
54446 // Inverted Cases
54447 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
54448 (CC == ISD::SETUGE && (-CmpC) == 2)) {
54449 // These will constant fold.
54450 C0 = DAG.getNOT(DL, AddC, OpVT);
54451 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
54452 DAG.getAllOnesConstant(DL, OpVT));
54453 }
54454 if (C0 && C1) {
54455 SDValue NewLHS =
54456 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
54457 SDValue NewRHS =
54458 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
54459 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
54460 }
54461 }
54462 }
54463
54464 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54465 // to avoid scalarization via legalization because v4i32 is not a legal type.
54466 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54467 LHS.getValueType() == MVT::v4f32)
54468 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54469
54470 // X pred 0.0 --> X pred -X
54471 // If the negation of X already exists, use it in the comparison. This removes
54472 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54473 // instructions in patterns with a 'select' node.
54474 if (isNullFPScalarOrVectorConst(RHS) && Subtarget.hasSSE2()) {
54475 SDVTList FNegVT = DAG.getVTList(OpVT);
54476 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54477 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54478 }
54479
54480 return SDValue();
54481}
54482
54483static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54484 TargetLowering::DAGCombinerInfo &DCI,
54485 const X86Subtarget &Subtarget) {
54486 SDValue Src = N->getOperand(0);
54487 MVT SrcVT = Src.getSimpleValueType();
54488 MVT VT = N->getSimpleValueType(0);
54489 unsigned NumBits = VT.getScalarSizeInBits();
54490 unsigned NumElts = SrcVT.getVectorNumElements();
54491 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54492 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54493
54494 // Perform constant folding.
54495 APInt UndefElts;
54496 SmallVector<APInt, 32> EltBits;
54497 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
54498 /*AllowWholeUndefs*/ true,
54499 /*AllowPartialUndefs*/ true)) {
54500 APInt Imm(32, 0);
54501 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54502 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54503 Imm.setBit(Idx);
54504
54505 return DAG.getConstant(Imm, SDLoc(N), VT);
54506 }
54507
54508 // Look through int->fp bitcasts that don't change the element width.
54509 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54510 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54511 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54512 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54513
54514 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54515 // with scalar comparisons.
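// The inner NOT becomes an XOR with the low NumElts bits set, e.g. for v4i32:
// movmsk(not(x)) == movmsk(x) ^ 0xF.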
54516 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54517 SDLoc DL(N);
54518 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54519 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54520 return DAG.getNode(ISD::XOR, DL, VT,
54521 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54522 DAG.getConstant(NotMask, DL, VT));
54523 }
54524
54525 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54526 // results with scalar comparisons.
54527 if (Src.getOpcode() == X86ISD::PCMPGT &&
54528 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54529 SDLoc DL(N);
54530 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54531 return DAG.getNode(ISD::XOR, DL, VT,
54532 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54533 DAG.getConstant(NotMask, DL, VT));
54534 }
54535
54536 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54537 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54538 // iff pow2splat(c1).
54539 // Use KnownBits to determine if only a single bit is non-zero
54540 // in each element (pow2 or zero), and shift that bit to the msb.
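// For example, on v4i32 with c1 == splat(0x80):
// movmsk(icmp_eq(and(x, 0x80), 0x80)) -> movmsk(shl(x, 24)), since shifting
// left by 24 moves bit 7 (the only bit c1 lets through) into the sign bit,
// which is all MOVMSK reads.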
54541 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54542 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54543 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54544 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54545 if (KnownLHS.countMaxPopulation() == 1 &&
54546 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54547 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54548 SDLoc DL(N);
54549 MVT ShiftVT = SrcVT;
54550 SDValue ShiftLHS = Src.getOperand(0);
54551 SDValue ShiftRHS = Src.getOperand(1);
54552 if (ShiftVT.getScalarType() == MVT::i8) {
54553 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54554 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54555 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54556 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54557 }
54558 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54559 ShiftLHS, ShiftAmt, DAG);
54560 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54561 ShiftRHS, ShiftAmt, DAG);
54562 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54563 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54564 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54565 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54566 }
54567 }
54568
54569 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54570 if (N->isOnlyUserOf(Src.getNode())) {
54571 SDValue SrcBC = peekThroughBitcasts(Src);
54572 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54573 APInt UndefElts;
54574 SmallVector<APInt, 32> EltBits;
54575 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54576 UndefElts, EltBits)) {
54577 APInt Mask = APInt::getZero(NumBits);
54578 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54579 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54580 Mask.setBit(Idx);
54581 }
54582 SDLoc DL(N);
54583 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54584 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54585 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54586 DAG.getConstant(Mask, DL, VT));
54587 }
54588 }
54589 }
54590
54591 // Simplify the inputs.
54592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54593 APInt DemandedMask(APInt::getAllOnes(NumBits));
54594 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54595 return SDValue(N, 0);
54596
54597 return SDValue();
54598}
54599
54600static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54601 TargetLowering::DAGCombinerInfo &DCI,
54602 const X86Subtarget &Subtarget) {
54603 MVT VT = N->getSimpleValueType(0);
54604 unsigned NumBits = VT.getScalarSizeInBits();
54605
54606 // Simplify the inputs.
54607 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54608 APInt DemandedMask(APInt::getAllOnes(NumBits));
54609 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54610 return SDValue(N, 0);
54611
54612 return SDValue();
54613}
54614
54615static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54616 TargetLowering::DAGCombinerInfo &DCI) {
54617 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54618 SDValue Mask = MemOp->getMask();
54619
54620 // With vector masks we only demand the upper bit of the mask.
54621 if (Mask.getScalarValueSizeInBits() != 1) {
54622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54623 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54624 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54625 if (N->getOpcode() != ISD::DELETED_NODE)
54626 DCI.AddToWorklist(N);
54627 return SDValue(N, 0);
54628 }
54629 }
54630
54631 return SDValue();
54632}
54633
54634static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54635 SDValue Index, SDValue Base, SDValue Scale,
54636 SelectionDAG &DAG) {
54637 SDLoc DL(GorS);
54638
54639 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54640 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54641 Gather->getMask(), Base, Index, Scale } ;
54642 return DAG.getMaskedGather(Gather->getVTList(),
54643 Gather->getMemoryVT(), DL, Ops,
54644 Gather->getMemOperand(),
54645 Gather->getIndexType(),
54646 Gather->getExtensionType());
54647 }
54648 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54649 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54650 Scatter->getMask(), Base, Index, Scale };
54651 return DAG.getMaskedScatter(Scatter->getVTList(),
54652 Scatter->getMemoryVT(), DL,
54653 Ops, Scatter->getMemOperand(),
54654 Scatter->getIndexType(),
54655 Scatter->isTruncatingStore());
54656}
54657
54658static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54659 TargetLowering::DAGCombinerInfo &DCI) {
54660 SDLoc DL(N);
54661 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54662 SDValue Index = GorS->getIndex();
54663 SDValue Base = GorS->getBasePtr();
54664 SDValue Scale = GorS->getScale();
54665 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54666
54667 if (DCI.isBeforeLegalize()) {
54668 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54669
54670 // Shrink constant indices if they are larger than 32-bits.
54671 // Only do this before legalize types since v2i64 could become v2i32.
54672 // FIXME: We could check that the type is legal if we're after legalize
54673 // types, but then we would need to construct test cases where that happens.
54674 // FIXME: We could support more than just constant vectors, but we need to be
54675 // careful with costing. A truncate that can be optimized out would be fine.
54676 // Otherwise we might only want to create a truncate if it avoids a split.
54677 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54678 if (BV->isConstant() && IndexWidth > 32 &&
54679 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54680 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54681 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54682 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54683 }
54684 }
54685
54686 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54687 // there are sufficient sign bits. Only do this before legalize types to
54688 // avoid creating illegal types in truncate.
54689 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54690 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54691 IndexWidth > 32 &&
54692 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54693 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54694 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54695 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54696 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54697 }
54698 }
54699
54700 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54701 // Try to move splat constant adders from the index operand to the base
54702 // pointer operand. Taking care to multiply by the scale. We can only do
54703 // this when index element type is the same as the pointer type.
54704 // Otherwise we need to be sure the math doesn't wrap before the scale.
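// For example, with Base == %p, Index == (add %v, splat(4)) and Scale == 8,
// the splat adder is folded away: Base becomes (%p + 32) and Index becomes %v.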
54705 if (Index.getOpcode() == ISD::ADD &&
54706 Index.getValueType().getVectorElementType() == PtrVT &&
54707 isa<ConstantSDNode>(Scale)) {
54708 uint64_t ScaleAmt = Scale->getAsZExtVal();
54709 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54710 BitVector UndefElts;
54711 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54712 // FIXME: Allow non-constant?
54713 if (UndefElts.none()) {
54714 // Apply the scale.
54715 APInt Adder = C->getAPIntValue() * ScaleAmt;
54716 // Add it to the existing base.
54717 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54718 DAG.getConstant(Adder, DL, PtrVT));
54719 Index = Index.getOperand(0);
54720 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54721 }
54722 }
54723
54724 // It's also possible base is just a constant. In that case, just
54725 // replace it with 0 and move the displacement into the index.
54726 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54727 isOneConstant(Scale)) {
54728 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54729 // Combine the constant build_vector and the constant base.
54730 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54731 Index.getOperand(1), Splat);
54732 // Add to the LHS of the original Index add.
54733 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54734 Index.getOperand(0), Splat);
54735 Base = DAG.getConstant(0, DL, Base.getValueType());
54736 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54737 }
54738 }
54739 }
54740
54741 if (DCI.isBeforeLegalizeOps()) {
54742 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54743
54744 // Make sure the index is either i32 or i64
54745 if (IndexWidth != 32 && IndexWidth != 64) {
54746 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54747 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54748 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54749 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54750 }
54751 }
54752
54753 // With vector masks we only demand the upper bit of the mask.
54754 SDValue Mask = GorS->getMask();
54755 if (Mask.getScalarValueSizeInBits() != 1) {
54756 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54757 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54758 if (N->getOpcode() != ISD::DELETED_NODE)
54759 DCI.AddToWorklist(N);
54760 return SDValue(N, 0);
54761 }
54762 }
54763
54764 return SDValue();
54765}
54766
54767// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54768static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54769 const X86Subtarget &Subtarget) {
54770 SDLoc DL(N);
54771 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54772 SDValue EFLAGS = N->getOperand(1);
54773
54774 // Try to simplify the EFLAGS and condition code operands.
54775 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54776 return getSETCC(CC, Flags, DL, DAG);
54777
54778 return SDValue();
54779}
54780
54781/// Optimize branch condition evaluation.
54782static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54783 const X86Subtarget &Subtarget) {
54784 SDLoc DL(N);
54785 SDValue EFLAGS = N->getOperand(3);
54786 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54787
54788 // Try to simplify the EFLAGS and condition code operands.
54789 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54790 // RAUW them under us.
54791 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54792 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54793 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54794 N->getOperand(1), Cond, Flags);
54795 }
54796
54797 return SDValue();
54798}
54799
54800// TODO: Could we move this to DAGCombine?
54801static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54802 SelectionDAG &DAG) {
54803 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54804 // to optimize away the operation when it comes from a constant.
54805 //
54806 // The general transformation is:
54807 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54808 // AND(VECTOR_CMP(x,y), constant2)
54809 // constant2 = UNARYOP(constant)
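// For example, (sint_to_fp (and (setcc x, y), splat(4))) becomes
// (bitcast (and (setcc x, y), (bitcast splat(4.0)))): each lane of the compare
// is all-ones or zero, so the AND selects either the converted constant or the
// all-zero pattern, which is also 0.0.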
54810
54811 // Early exit if this isn't a vector operation, the operand of the
54812 // unary operation isn't a bitwise AND, or if the sizes of the operations
54813 // aren't the same.
54814 EVT VT = N->getValueType(0);
54815 bool IsStrict = N->isStrictFPOpcode();
54816 unsigned NumEltBits = VT.getScalarSizeInBits();
54817 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54818 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54819 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54820 VT.getSizeInBits() != Op0.getValueSizeInBits())
54821 return SDValue();
54822
54823 // Now check that the other operand of the AND is a constant. We could
54824 // make the transformation for non-constant splats as well, but it's unclear
54825 // that would be a benefit as it would not eliminate any operations, just
54826 // perform one more step in scalar code before moving to the vector unit.
54827 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54828 // Bail out if the vector isn't a constant.
54829 if (!BV->isConstant())
54830 return SDValue();
54831
54832 // Everything checks out. Build up the new and improved node.
54833 SDLoc DL(N);
54834 EVT IntVT = BV->getValueType(0);
54835 // Create a new constant of the appropriate type for the transformed
54836 // DAG.
54837 SDValue SourceConst;
54838 if (IsStrict)
54839 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54840 {N->getOperand(0), SDValue(BV, 0)});
54841 else
54842 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54843 // The AND node needs bitcasts to/from an integer vector type around it.
54844 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54845 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54846 MaskConst);
54847 SDValue Res = DAG.getBitcast(VT, NewAnd);
54848 if (IsStrict)
54849 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54850 return Res;
54851 }
54852
54853 return SDValue();
54854}
54855
54856/// If we are converting a value to floating-point, try to replace scalar
54857/// truncate of an extracted vector element with a bitcast. This tries to keep
54858/// the sequence on XMM registers rather than moving between vector and GPRs.
54859static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54860 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54861 // to allow being called by any similar cast opcode.
54862 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54863 SDValue Trunc = N->getOperand(0);
54864 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54865 return SDValue();
54866
54867 SDValue ExtElt = Trunc.getOperand(0);
54868 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54869 !isNullConstant(ExtElt.getOperand(1)))
54870 return SDValue();
54871
54872 EVT TruncVT = Trunc.getValueType();
54873 EVT SrcVT = ExtElt.getValueType();
54874 unsigned DestWidth = TruncVT.getSizeInBits();
54875 unsigned SrcWidth = SrcVT.getSizeInBits();
54876 if (SrcWidth % DestWidth != 0)
54877 return SDValue();
54878
54879 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54880 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54881 unsigned VecWidth = SrcVecVT.getSizeInBits();
54882 unsigned NumElts = VecWidth / DestWidth;
54883 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54884 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54885 SDLoc DL(N);
54886 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54887 BitcastVec, ExtElt.getOperand(1));
54888 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54889}
54890
54891static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54892 const X86Subtarget &Subtarget) {
54893 bool IsStrict = N->isStrictFPOpcode();
54894 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54895 EVT VT = N->getValueType(0);
54896 EVT InVT = Op0.getValueType();
54897
54898 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54899 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54900 // if hasFP16 support:
54901 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54902 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54903 // else
54904 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54905 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54906 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54907 unsigned ScalarSize = InVT.getScalarSizeInBits();
54908 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54909 ScalarSize >= 64)
54910 return SDValue();
54911 SDLoc dl(N);
54912 EVT DstVT =
54913 EVT::getVectorVT(*DAG.getContext(),
54914 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54915 : ScalarSize < 32 ? MVT::i32
54916 : MVT::i64,
54917 InVT.getVectorNumElements());
54918 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54919 if (IsStrict)
54920 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54921 {N->getOperand(0), P});
54922 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54923 }
54924
54925 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54926 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54927 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54928 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54929 VT.getScalarType() != MVT::f16) {
54930 SDLoc dl(N);
54931 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54932 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54933
54934 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54935 if (IsStrict)
54936 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54937 {N->getOperand(0), P});
54938 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54939 }
54940
54941 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54942 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54943 // the optimization here.
54944 SDNodeFlags Flags = N->getFlags();
54945 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54946 if (IsStrict)
54947 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54948 {N->getOperand(0), Op0});
54949 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54950 }
54951
54952 return SDValue();
54953}
54954
54955static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54956 TargetLowering::DAGCombinerInfo &DCI,
54957 const X86Subtarget &Subtarget) {
54958 // First try to optimize away the conversion entirely when it's
54959 // conditionally from a constant. Vectors only.
54960 bool IsStrict = N->isStrictFPOpcode();
54961 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54962 return Res;
54963
54964 // Now move on to more general possibilities.
54965 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54966 EVT VT = N->getValueType(0);
54967 EVT InVT = Op0.getValueType();
54968
54969 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54970 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54971 // if hasFP16 support:
54972 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54973 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54974 // else
54975 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54976 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54977 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54978 unsigned ScalarSize = InVT.getScalarSizeInBits();
54979 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54980 ScalarSize >= 64)
54981 return SDValue();
54982 SDLoc dl(N);
54983 EVT DstVT =
54984 EVT::getVectorVT(*DAG.getContext(),
54985 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54986 : ScalarSize < 32 ? MVT::i32
54987 : MVT::i64,
54988 InVT.getVectorNumElements());
54989 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54990 if (IsStrict)
54991 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54992 {N->getOperand(0), P});
54993 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54994 }
54995
54996 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54997 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54998 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54999 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55000 VT.getScalarType() != MVT::f16) {
55001 SDLoc dl(N);
55002 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55003 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55004 if (IsStrict)
55005 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55006 {N->getOperand(0), P});
55007 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55008 }
55009
55010 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55011 // vectors and scalars, see if we know that the upper bits are all the sign
55012 // bit, in which case we can truncate the input to i32 and convert from that.
55013 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55014 unsigned BitWidth = InVT.getScalarSizeInBits();
55015 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55016 if (NumSignBits >= (BitWidth - 31)) {
55017 EVT TruncVT = MVT::i32;
55018 if (InVT.isVector())
55019 TruncVT = InVT.changeVectorElementType(TruncVT);
55020 SDLoc dl(N);
55021 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55022 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55023 if (IsStrict)
55024 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55025 {N->getOperand(0), Trunc});
55026 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55027 }
55028 // If we're after legalize and the type is v2i32 we need to shuffle and
55029 // use CVTSI2P.
55030 assert(InVT == MVT::v2i64 && "Unexpected VT!");
55031 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55032 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55033 { 0, 2, -1, -1 });
55034 if (IsStrict)
55035 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55036 {N->getOperand(0), Shuf});
55037 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55038 }
55039 }
55040
55041 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55042 // a 32-bit target where SSE doesn't support i64->FP operations.
55043 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55044 Op0.getOpcode() == ISD::LOAD) {
55045 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55046
55047 // This transformation is not supported if the result type is f16 or f128.
55048 if (VT == MVT::f16 || VT == MVT::f128)
55049 return SDValue();
55050
55051 // If we have AVX512DQ we can use packed conversion instructions unless
55052 // the VT is f80.
55053 if (Subtarget.hasDQI() && VT != MVT::f80)
55054 return SDValue();
55055
55056 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55057 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55058 std::pair<SDValue, SDValue> Tmp =
55059 Subtarget.getTargetLowering()->BuildFILD(
55060 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55061 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55062 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55063 return Tmp.first;
55064 }
55065 }
55066
55067 if (IsStrict)
55068 return SDValue();
55069
55070 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55071 return V;
55072
55073 return SDValue();
55074}
55075
55076static bool needCarryOrOverflowFlag(SDValue Flags) {
55077 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55078
55079 for (const SDNode *User : Flags->uses()) {
55080 X86::CondCode CC;
55081 switch (User->getOpcode()) {
55082 default:
55083 // Be conservative.
55084 return true;
55085 case X86ISD::SETCC:
55086 case X86ISD::SETCC_CARRY:
55087 CC = (X86::CondCode)User->getConstantOperandVal(0);
55088 break;
55089 case X86ISD::BRCOND:
55090 case X86ISD::CMOV:
55091 CC = (X86::CondCode)User->getConstantOperandVal(2);
55092 break;
55093 }
55094
55095 switch (CC) {
55096 // clang-format off
55097 default: break;
55098 case X86::COND_A: case X86::COND_AE:
55099 case X86::COND_B: case X86::COND_BE:
55100 case X86::COND_O: case X86::COND_NO:
55101 case X86::COND_G: case X86::COND_GE:
55102 case X86::COND_L: case X86::COND_LE:
55103 return true;
55104 // clang-format on
55105 }
55106 }
55107
55108 return false;
55109}
55110
55111static bool onlyZeroFlagUsed(SDValue Flags) {
55112 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55113
55114 for (const SDNode *User : Flags->uses()) {
55115 unsigned CCOpNo;
55116 switch (User->getOpcode()) {
55117 default:
55118 // Be conservative.
55119 return false;
55120 case X86ISD::SETCC:
55121 case X86ISD::SETCC_CARRY:
55122 CCOpNo = 0;
55123 break;
55124 case X86ISD::BRCOND:
55125 case X86ISD::CMOV:
55126 CCOpNo = 2;
55127 break;
55128 }
55129
55130 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55131 if (CC != X86::COND_E && CC != X86::COND_NE)
55132 return false;
55133 }
55134
55135 return true;
55136}
55137
55138static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
55139 TargetLowering::DAGCombinerInfo &DCI,
55140 const X86Subtarget &Subtarget) {
55141 // Only handle test patterns.
55142 if (!isNullConstant(N->getOperand(1)))
55143 return SDValue();
55144
55145 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55146 // and use its flags directly.
55147 // TODO: Maybe we should try promoting compares that only use the zero flag
55148 // first if we can prove the upper bits with computeKnownBits?
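// For example, a compare-with-zero of (trunc (xor i64 a, b)) to i32 can be
// replaced by the i32 node (X86ISD::XOR (trunc a), (trunc b)), reusing its
// EFLAGS result directly (see the narrowing at the end of this function).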
55149 SDLoc dl(N);
55150 SDValue Op = N->getOperand(0);
55151 EVT VT = Op.getValueType();
55152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55153
55154 if (SDValue CMP =
55155 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
55156 return CMP;
55157
55158 // If we have a constant logical shift that's only used in a comparison
55159 // against zero turn it into an equivalent AND. This allows turning it into
55160 // a TEST instruction later.
55161 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55162 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55163 onlyZeroFlagUsed(SDValue(N, 0))) {
55164 unsigned BitWidth = VT.getSizeInBits();
55165 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55166 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55167 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55168 APInt Mask = Op.getOpcode() == ISD::SRL
55169 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55170 : APInt::getLowBitsSet(BitWidth, MaskBits);
55171 if (Mask.isSignedIntN(32)) {
55172 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55173 DAG.getConstant(Mask, dl, VT));
55174 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55175 DAG.getConstant(0, dl, VT));
55176 }
55177 }
55178 }
55179
55180 // If we're extracting from a avx512 bool vector and comparing against zero,
55181 // then try to just bitcast the vector to an integer to use TEST/BT directly.
55182 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
55183 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
55184 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
55185 SDValue Src = Op.getOperand(0);
55186 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55187 isNullConstant(Src.getOperand(1)) &&
55188 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
55189 SDValue BoolVec = Src.getOperand(0);
55190 unsigned ShAmt = 0;
55191 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
55192 ShAmt = BoolVec.getConstantOperandVal(1);
55193 BoolVec = BoolVec.getOperand(0);
55194 }
55195 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
55196 EVT VecVT = BoolVec.getValueType();
55197 unsigned BitWidth = VecVT.getVectorNumElements();
55198 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
55199 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
55200 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
55201 Op = DAG.getBitcast(BCVT, BoolVec);
55202 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
55203 DAG.getConstant(Mask, dl, BCVT));
55204 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55205 DAG.getConstant(0, dl, BCVT));
55206 }
55207 }
55208 }
55209
55210 // Peek through any zero-extend if we're only testing for a zero result.
55211 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55212 SDValue Src = Op.getOperand(0);
55213 EVT SrcVT = Src.getValueType();
55214 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
55215 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55216 DAG.getConstant(0, dl, SrcVT));
55217 }
55218
55219 // Look for a truncate.
55220 if (Op.getOpcode() != ISD::TRUNCATE)
55221 return SDValue();
55222
55223 SDValue Trunc = Op;
55224 Op = Op.getOperand(0);
55225
55226 // See if we can compare with zero against the truncation source,
55227 // which should help using the Z flag from many ops. Only do this for
55228 // i32 truncated op to prevent partial-reg compares of promoted ops.
55229 EVT OpVT = Op.getValueType();
55230 APInt UpperBits =
55231 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55232 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55233 onlyZeroFlagUsed(SDValue(N, 0))) {
55234 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55235 DAG.getConstant(0, dl, OpVT));
55236 }
55237
55238 // After this the truncate and arithmetic op must have a single use.
55239 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55240 return SDValue();
55241
55242 unsigned NewOpc;
55243 switch (Op.getOpcode()) {
55244 default: return SDValue();
55245 case ISD::AND:
55246 // Skip and with constant. We have special handling for and with immediate
55247 // during isel to generate test instructions.
55248 if (isa<ConstantSDNode>(Op.getOperand(1)))
55249 return SDValue();
55250 NewOpc = X86ISD::AND;
55251 break;
55252 case ISD::OR: NewOpc = X86ISD::OR; break;
55253 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55254 case ISD::ADD:
55255 // If the carry or overflow flag is used, we can't truncate.
55256 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55257 return SDValue();
55258 NewOpc = X86ISD::ADD;
55259 break;
55260 case ISD::SUB:
55261 // If the carry or overflow flag is used, we can't truncate.
55262 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55263 return SDValue();
55264 NewOpc = X86ISD::SUB;
55265 break;
55266 }
55267
55268 // We found an op we can narrow. Truncate its inputs.
55269 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55270 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55271
55272 // Use a X86 specific opcode to avoid DAG combine messing with it.
55273 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55274 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55275
55276 // For AND, keep a CMP so that we can match the test pattern.
55277 if (NewOpc == X86ISD::AND)
55278 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55279 DAG.getConstant(0, dl, VT));
55280
55281 // Return the flags.
55282 return Op.getValue(1);
55283}
55284
55285static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55286 TargetLowering::DAGCombinerInfo &DCI,
55287 const X86Subtarget &ST) {
55288 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55289 "Expected X86ISD::ADD or X86ISD::SUB");
55290
55291 SDLoc DL(N);
55292 SDValue LHS = N->getOperand(0);
55293 SDValue RHS = N->getOperand(1);
55294 MVT VT = LHS.getSimpleValueType();
55295 bool IsSub = X86ISD::SUB == N->getOpcode();
55296 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55297
55298 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
55299 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
55300 return CMP;
55301
55302 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55303 if (!N->hasAnyUseOfValue(1)) {
55304 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55305 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55306 }
55307
55308 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55309 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55310 SDValue Ops[] = {N0, N1};
55311 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55312 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55313 SDValue Op(N, 0);
55314 if (Negate)
55315 Op = DAG.getNegative(Op, DL, VT);
55316 DCI.CombineTo(GenericAddSub, Op);
55317 }
55318 };
55319 MatchGeneric(LHS, RHS, false);
55320 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55321
55322 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55323 // EFLAGS result doesn't change.
55324 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55325 /*ZeroSecondOpOnly*/ true);
55326}
55327
55328static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55329 SDValue LHS = N->getOperand(0);
55330 SDValue RHS = N->getOperand(1);
55331 SDValue BorrowIn = N->getOperand(2);
55332
55333 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55334 MVT VT = N->getSimpleValueType(0);
55335 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55336 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55337 }
55338
55339 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55340 // iff the flag result is dead.
55341 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55342 !N->hasAnyUseOfValue(1))
55343 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55344 LHS.getOperand(1), BorrowIn);
55345
55346 return SDValue();
55347}
55348
55349// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55350static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55351 TargetLowering::DAGCombinerInfo &DCI) {
55352 SDValue LHS = N->getOperand(0);
55353 SDValue RHS = N->getOperand(1);
55354 SDValue CarryIn = N->getOperand(2);
55355 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55356 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55357
55358 // Canonicalize constant to RHS.
55359 if (LHSC && !RHSC)
55360 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55361 CarryIn);
55362
55363 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55364 // the result is either zero or one (depending on the input carry bit).
55365 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
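// i.e. (adc 0, 0, carry) == carry ? 1 : 0, so the result can be rebuilt as
// (and (setcc_carry COND_B, carry), 1) with a known-zero carry-out.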
55366 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55367 // We don't have a good way to replace an EFLAGS use, so only do this when
55368 // dead right now.
55369 SDValue(N, 1).use_empty()) {
55370 SDLoc DL(N);
55371 EVT VT = N->getValueType(0);
55372 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55373 SDValue Res1 = DAG.getNode(
55374 ISD::AND, DL, VT,
55375 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55376 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55377 DAG.getConstant(1, DL, VT));
55378 return DCI.CombineTo(N, Res1, CarryOut);
55379 }
55380
55381 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55382 // iff the flag result is dead.
55383 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55384 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55385 SDLoc DL(N);
55386 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55387 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55388 DAG.getConstant(0, DL, LHS.getValueType()),
55389 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55390 }
55391
55392 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55393 MVT VT = N->getSimpleValueType(0);
55394 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55395 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55396 }
55397
55398 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55399 // iff the flag result is dead.
55400 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55401 !N->hasAnyUseOfValue(1))
55402 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55403 LHS.getOperand(1), CarryIn);
55404
55405 return SDValue();
55406}
55407
55408static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55409 const SDLoc &DL, EVT VT,
55410 const X86Subtarget &Subtarget) {
55411 // Example of pattern we try to detect:
55412 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55413 //(add (build_vector (extract_elt t, 0),
55414 // (extract_elt t, 2),
55415 // (extract_elt t, 4),
55416 // (extract_elt t, 6)),
55417 // (build_vector (extract_elt t, 1),
55418 // (extract_elt t, 3),
55419 // (extract_elt t, 5),
55420 // (extract_elt t, 7)))
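// Each i32 result lane k is t[2k] + t[2k+1], i.e.
// x0[2k]*x1[2k] + x0[2k+1]*x1[2k+1], which is exactly what PMADDWD computes
// from the truncated i16 inputs.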
55421
55422 if (!Subtarget.hasSSE2())
55423 return SDValue();
55424
55425 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55426 Op1.getOpcode() != ISD::BUILD_VECTOR)
55427 return SDValue();
55428
55429 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55430 VT.getVectorNumElements() < 4 ||
55431 !isPowerOf2_32(VT.getVectorNumElements()))
55432 return SDValue();
55433
55434 // Check if one of Op0,Op1 is of the form:
55435 // (build_vector (extract_elt Mul, 0),
55436 // (extract_elt Mul, 2),
55437 // (extract_elt Mul, 4),
55438 // ...
55439 // the other is of the form:
55440 // (build_vector (extract_elt Mul, 1),
55441 // (extract_elt Mul, 3),
55442 // (extract_elt Mul, 5),
55443 // ...
55444 // and identify Mul.
55445 SDValue Mul;
55446 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55447 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55448 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55449 // TODO: Be more tolerant to undefs.
55450 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55451 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55452 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55453 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55454 return SDValue();
55455 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55456 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55457 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55458 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55459 if (!Const0L || !Const1L || !Const0H || !Const1H)
55460 return SDValue();
55461 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55462 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55463 // Commutativity of mul allows factors of a product to reorder.
55464 if (Idx0L > Idx1L)
55465 std::swap(Idx0L, Idx1L);
55466 if (Idx0H > Idx1H)
55467 std::swap(Idx0H, Idx1H);
55468 // Commutativity of add allows pairs of factors to reorder.
55469 if (Idx0L > Idx0H) {
55470 std::swap(Idx0L, Idx0H);
55471 std::swap(Idx1L, Idx1H);
55472 }
55473 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55474 Idx1H != 2 * i + 3)
55475 return SDValue();
55476 if (!Mul) {
55477 // First time an extract_elt's source vector is visited. Must be a MUL
55478 // with 2X the number of vector elements of the BUILD_VECTOR.
55479 // Both extracts must be from the same MUL.
55480 Mul = Op0L->getOperand(0);
55481 if (Mul->getOpcode() != ISD::MUL ||
55482 Mul.getValueType().getVectorNumElements() != 2 * e)
55483 return SDValue();
55484 }
55485 // Check that the extract is from the same MUL previously seen.
55486 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55487 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55488 return SDValue();
55489 }
55490
55491 // Check if the Mul source can be safely shrunk.
55492 ShrinkMode Mode;
55493 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55494 Mode == ShrinkMode::MULU16)
55495 return SDValue();
55496
55497 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55498 VT.getVectorNumElements() * 2);
55499 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55500 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55501
55502 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55503 ArrayRef<SDValue> Ops) {
55504 EVT InVT = Ops[0].getValueType();
55505 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55506 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55507 InVT.getVectorNumElements() / 2);
55508 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55509 };
55510 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55511}
55512
55513// Attempt to turn this pattern into PMADDWD.
55514// (add (mul (sext (build_vector)), (sext (build_vector))),
55515// (mul (sext (build_vector)), (sext (build_vector)))
55516static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55517 const SDLoc &DL, EVT VT,
55518 const X86Subtarget &Subtarget) {
55519 if (!Subtarget.hasSSE2())
55520 return SDValue();
55521
55522 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55523 return SDValue();
55524
55525 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55526 VT.getVectorNumElements() < 4 ||
55527 VT.getVectorNumElements() % 2 != 0)
55528 return SDValue();
55529
55530 SDValue N00 = N0.getOperand(0);
55531 SDValue N01 = N0.getOperand(1);
55532 SDValue N10 = N1.getOperand(0);
55533 SDValue N11 = N1.getOperand(1);
55534
55535 // All inputs need to be sign extends.
55536 // TODO: Support ZERO_EXTEND from known positive?
55537 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55538 N01.getOpcode() != ISD::SIGN_EXTEND ||
55539 N10.getOpcode() != ISD::SIGN_EXTEND ||
55540 N11.getOpcode() != ISD::SIGN_EXTEND)
55541 return SDValue();
55542
55543 // Peek through the extends.
55544 N00 = N00.getOperand(0);
55545 N01 = N01.getOperand(0);
55546 N10 = N10.getOperand(0);
55547 N11 = N11.getOperand(0);
55548
55549 // Must be extending from vXi16.
55550 EVT InVT = N00.getValueType();
55551 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55552 N10.getValueType() != InVT || N11.getValueType() != InVT)
55553 return SDValue();
55554
55555 // All inputs should be build_vectors.
55556 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55557 N01.getOpcode() != ISD::BUILD_VECTOR ||
55558 N10.getOpcode() != ISD::BUILD_VECTOR ||
55559 N11.getOpcode() != ISD::BUILD_VECTOR)
55560 return SDValue();
55561
55562 // For each element, we need to ensure we have an odd element from one vector
55563 // multiplied by the odd element of another vector and the even element from
55564 // one of the same vectors being multiplied by the even element from the
55565 // other vector. So we need to make sure that for each element i, this
55566 // operation is performed:
55567 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
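  // For example (illustrative), at i = 0 the four extracts must read
  // elements 0 and 1 of the same two source vectors, so the lane computes
  // A[0] * B[0] + A[1] * B[1]; the index checks below enforce exactly this.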
55568 SDValue In0, In1;
55569 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55570 SDValue N00Elt = N00.getOperand(i);
55571 SDValue N01Elt = N01.getOperand(i);
55572 SDValue N10Elt = N10.getOperand(i);
55573 SDValue N11Elt = N11.getOperand(i);
55574 // TODO: Be more tolerant to undefs.
55575 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55576 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55577 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55578 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55579 return SDValue();
55580 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55581 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55582 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55583 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55584 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55585 return SDValue();
55586 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55587 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55588 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55589 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55590 // Add is commutative so indices can be reordered.
55591 if (IdxN00 > IdxN10) {
55592 std::swap(IdxN00, IdxN10);
55593 std::swap(IdxN01, IdxN11);
55594 }
55595 // N0 indices must be the even element. N1 indices must be the next odd element.
55596 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55597 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55598 return SDValue();
55599 SDValue N00In = N00Elt.getOperand(0);
55600 SDValue N01In = N01Elt.getOperand(0);
55601 SDValue N10In = N10Elt.getOperand(0);
55602 SDValue N11In = N11Elt.getOperand(0);
55603
55604 // First time we find an input capture it.
55605 if (!In0) {
55606 In0 = N00In;
55607 In1 = N01In;
55608
55609 // The input vectors must be at least as wide as the output.
55610 // If they are larger than the output, we extract subvector below.
55611 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55612 In1.getValueSizeInBits() < VT.getSizeInBits())
55613 return SDValue();
55614 }
55615 // Mul is commutative so the input vectors can be in any order.
55616 // Canonicalize to make the compares easier.
55617 if (In0 != N00In)
55618 std::swap(N00In, N01In);
55619 if (In0 != N10In)
55620 std::swap(N10In, N11In);
55621 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55622 return SDValue();
55623 }
55624
55625 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55626 ArrayRef<SDValue> Ops) {
55627 EVT OpVT = Ops[0].getValueType();
55628 assert(OpVT.getScalarType() == MVT::i16 &&
55629 "Unexpected scalar element type");
55630 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55631 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55632 OpVT.getVectorNumElements() / 2);
55633 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55634 };
55635
55636 // If the output is narrower than an input, extract the low part of the input
55637 // vector.
55638 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55639 VT.getVectorNumElements() * 2);
55640 if (OutVT16.bitsLT(In0.getValueType())) {
55641 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55642 DAG.getIntPtrConstant(0, DL));
55643 }
55644 if (OutVT16.bitsLT(In1.getValueType())) {
55645 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55646 DAG.getIntPtrConstant(0, DL));
55647 }
55648 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55649 PMADDBuilder);
55650}
55651
55652// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55653// If the upper element in each pair of both VPMADDWD operands is zero then we
55654// can merge the operand elements and use the implicit add of VPMADDWD.
55655// TODO: Add support for VPMADDUBSW (which isn't commutable).
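// Illustrative example: if X = <x0,0,x2,0,...> and Z = <z0,0,z2,0,...> (the
// upper i16 of every pair is known zero), each i32 lane of the two VPMADDWD
// results is just x_i*y_i and z_i*w_i, so interleaving the operands lets one
// VPMADDWD perform the final add as its implicit pairwise sum.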
55656static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55657 const SDLoc &DL, EVT VT) {
55658 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55659 return SDValue();
55660
55661 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55662 if (VT.getSizeInBits() > 128)
55663 return SDValue();
55664
55665 unsigned NumElts = VT.getVectorNumElements();
55666 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55667 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55668 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55669
55670 bool Op0HiZero =
55671 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55672 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55673 bool Op1HiZero =
55674 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55675 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55676
55677 // TODO: Check for zero lower elements once we have actual codegen that
55678 // creates them.
55679 if (!Op0HiZero || !Op1HiZero)
55680 return SDValue();
55681
55682 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55683 SmallVector<int> Mask;
55684 for (int i = 0; i != (int)NumElts; ++i) {
55685 Mask.push_back(2 * i);
55686 Mask.push_back(2 * (i + NumElts));
55687 }
55688
55689 SDValue LHS =
55690 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55691 SDValue RHS =
55692 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55693 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55694}
55695
55696/// CMOV of constants requires materializing constant operands in registers.
55697/// Try to fold those constants into an 'add' instruction to reduce instruction
55698/// count. We do this with CMOV rather than the generic 'select' because there are
55699/// earlier folds that may be used to turn select-of-constants into logic hacks.
55700static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
55701 SelectionDAG &DAG,
55702 const X86Subtarget &Subtarget) {
55703 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55704 // better because we eliminate 1-2 instructions. This transform is still
55705 // an improvement without zero operands because we trade 2 move constants and
55706 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55707 // immediate asm operands (fit in 32-bits).
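  // Illustrative example (hypothetical operands): add (cmov 0, 42), %x is
  // turned into cmov %x, (add %x, 42) by the generic rewrite at the bottom of
  // this function, removing the constant materialization for the zero arm.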
55708 auto isSuitableCmov = [](SDValue V) {
55709 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55710 return false;
55711 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55712 !isa<ConstantSDNode>(V.getOperand(1)))
55713 return false;
55714 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55715 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55716 V.getConstantOperandAPInt(1).isSignedIntN(32));
55717 };
55718
55719 // Match an appropriate CMOV as the first operand of the add.
55720 SDValue Cmov = N->getOperand(0);
55721 SDValue OtherOp = N->getOperand(1);
55722 if (!isSuitableCmov(Cmov))
55723 std::swap(Cmov, OtherOp);
55724 if (!isSuitableCmov(Cmov))
55725 return SDValue();
55726
55727 // Don't remove a load folding opportunity for the add. That would neutralize
55728 // any improvements from removing constant materializations.
55729 if (X86::mayFoldLoad(OtherOp, Subtarget))
55730 return SDValue();
55731
55732 EVT VT = N->getValueType(0);
55733 SDValue FalseOp = Cmov.getOperand(0);
55734 SDValue TrueOp = Cmov.getOperand(1);
55735
55736 // We will push the add through the select, but we can potentially do better
55737 // if we know there is another add in the sequence and this is pointer math.
55738 // In that case, we can absorb an add into the trailing memory op and avoid
55739 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55740 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55741 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55742 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55743 all_of(N->uses(), [&](SDNode *Use) {
55744 auto *MemNode = dyn_cast<MemSDNode>(Use);
55745 return MemNode && MemNode->getBasePtr().getNode() == N;
55746 })) {
55747 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55748 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55749 // it is possible that choosing op1 might be better.
55750 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55751 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55752 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55753 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55754 Cmov.getOperand(2), Cmov.getOperand(3));
55755 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55756 }
55757
55758 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55759 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55760 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55761 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55762 Cmov.getOperand(3));
55763}
55764
55765static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55766                          TargetLowering::DAGCombinerInfo &DCI,
55767 const X86Subtarget &Subtarget) {
55768 EVT VT = N->getValueType(0);
55769 SDValue Op0 = N->getOperand(0);
55770 SDValue Op1 = N->getOperand(1);
55771 SDLoc DL(N);
55772
55773 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
55774 return Select;
55775
55776 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55777 return MAdd;
55778 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55779 return MAdd;
55780 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55781 return MAdd;
55782
55783 // Try to synthesize horizontal adds from adds of shuffles.
55784 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55785 return V;
55786
55787 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55788 // iff X and Y won't overflow.
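  // Illustrative example: two byte-sum reductions psadbw(X,0) and psadbw(Y,0)
  // whose inputs cannot overflow when added bytewise become a single
  // psadbw(add(X,Y),0), halving the number of PSADBW operations.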
55789 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55790 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
55791 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
55792 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55793 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55794 SDValue Sum =
55795 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55796 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55797 getZeroVector(OpVT, Subtarget, DAG, DL));
55798 }
55799 }
55800
55801 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55802 // (sub Y, (sext (vXi1 X))).
55803 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55804 // generic DAG combine without a legal type check, but adding this there
55805 // caused regressions.
55806 if (VT.isVector()) {
55807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55808 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55809 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55810 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55811 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55812 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55813 }
55814
55815 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55816 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55817 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55818 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55819 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55820 }
55821 }
55822
55823 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55824 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55825 X86::isZeroNode(Op0.getOperand(1))) {
55826 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55827 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55828 Op0.getOperand(0), Op0.getOperand(2));
55829 }
55830
55831 return combineAddOrSubToADCOrSBB(N, DL, DAG);
55832}
55833
55834// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55835// condition comes from the subtract node that produced -X. This matches the
55836// cmov expansion for absolute value. By swapping the operands we convert abs
55837// to nabs.
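// Illustrative example: when abs(%x) was expanded as
//   %neg = sub 0, %x
//   %abs = cmov %x, %neg   (predicated on the sign flag of the sub)
// the code below swaps the cmov arms to produce nabs(%x) and turns
// (sub %y, abs(%x)) into (add %y, nabs(%x)).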
55838static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55839 SDValue N0 = N->getOperand(0);
55840 SDValue N1 = N->getOperand(1);
55841
55842 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55843 return SDValue();
55844
55845 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55846 if (CC != X86::COND_S && CC != X86::COND_NS)
55847 return SDValue();
55848
55849 // Condition should come from a negate operation.
55850 SDValue Cond = N1.getOperand(3);
55851 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55852 return SDValue();
55853 assert(Cond.getResNo() == 1 && "Unexpected result number");
55854
55855 // Get the X and -X from the negate.
55856 SDValue NegX = Cond.getValue(0);
55857 SDValue X = Cond.getOperand(1);
55858
55859 SDValue FalseOp = N1.getOperand(0);
55860 SDValue TrueOp = N1.getOperand(1);
55861
55862 // Cmov operands should be X and NegX. Order doesn't matter.
55863 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55864 return SDValue();
55865
55866 // Build a new CMOV with the operands swapped.
55867 SDLoc DL(N);
55868 MVT VT = N->getSimpleValueType(0);
55869 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55870 N1.getOperand(2), Cond);
55871 // Convert sub to add.
55872 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55873}
55874
55875static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55876 SDValue Op0 = N->getOperand(0);
55877 SDValue Op1 = N->getOperand(1);
55878
55879 // (sub C (zero_extend (setcc)))
55880 // =>
55881 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
55882 // Don't disturb (sub 0 setcc), which is easily done with neg.
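  // Worked example (illustrative): for b = setcc in {0,1},
  //   5 - zext(b) == zext(!b) + 4,   since (1 - b) + (5 - 1) == 5 - b.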
55883 EVT VT = N->getValueType(0);
55884 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55885 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55886 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55887 Op1.getOperand(0).hasOneUse()) {
55888 SDValue SetCC = Op1.getOperand(0);
55889 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55890 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55891 APInt NewImm = Op0C->getAPIntValue() - 1;
55892 SDLoc DL(Op1);
55893 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55894 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55895 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55896 DAG.getConstant(NewImm, DL, VT));
55897 }
55898
55899 return SDValue();
55900}
55901
55902static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
55903 // res, flags2 = sub 0, (setcc cc, flag)
55904 // cload/cstore ..., cond_ne, flag2
55905 // ->
55906 // cload/cstore cc, flag
55907 if (N->getConstantOperandVal(3) != X86::COND_NE)
55908 return SDValue();
55909
55910 SDValue Sub = N->getOperand(4);
55911 if (Sub.getOpcode() != X86ISD::SUB)
55912 return SDValue();
55913
55914 SDValue SetCC = Sub.getOperand(1);
55915
55916 if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
55917 return SDValue();
55918
55919 SmallVector<SDValue, 5> Ops(N->op_values());
55920 Ops[3] = SetCC.getOperand(0);
55921 Ops[4] = SetCC.getOperand(1);
55922
55923 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
55924 cast<MemSDNode>(N)->getMemoryVT(),
55925 cast<MemSDNode>(N)->getMemOperand());
55926}
55927
55928static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55929                          TargetLowering::DAGCombinerInfo &DCI,
55930 const X86Subtarget &Subtarget) {
55931 SDValue Op0 = N->getOperand(0);
55932 SDValue Op1 = N->getOperand(1);
55933 SDLoc DL(N);
55934
55935 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55936 auto IsNonOpaqueConstant = [&](SDValue Op) {
55937 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55938 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55939 return !Cst->isOpaque();
55940 return true;
55941 }
55942 return false;
55943 };
55944
55945 // X86 can't encode an immediate LHS of a sub. See if we can push the
55946 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55947 // one use and a constant, invert the immediate, saving one register.
55948 // However, ignore cases where C1 is 0, as those will become a NEG.
55949 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
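  // Why the identity holds (illustrative): X ^ C2 == ~(X ^ ~C2), and for any
  // value v we have -~v == v + 1, so
  //   C1 - (X ^ C2) == C1 + (X ^ ~C2) + 1 == (X ^ ~C2) + (C1 + 1).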
55950 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55951 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55952 Op1->hasOneUse()) {
55953 EVT VT = Op0.getValueType();
55954 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55955 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55956 SDValue NewAdd =
55957 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55958 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55959 }
55960
55961 if (SDValue V = combineSubABS(N, DAG))
55962 return V;
55963
55964 // Try to synthesize horizontal subs from subs of shuffles.
55965 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55966 return V;
55967
55968 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55969 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55970 X86::isZeroNode(Op1.getOperand(1))) {
55971 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55972 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55973 Op1.getOperand(0), Op1.getOperand(2));
55974 }
55975
55976 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55977 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
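  // Rationale (illustrative): X - (Y - Z - Cf) == (X + Z + Cf) - Y, so the
  // inner SBB's borrow becomes the carry of an ADC that feeds a plain SUB.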
55978 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55979 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55980 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55981 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55982 Op1.getOperand(1), Op1.getOperand(2));
55983 return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0),
55984 Op1.getOperand(0));
55985 }
55986
55987 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55988 return V;
55989
55990 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
55991 return V;
55992
55993 return combineSubSetcc(N, DAG);
55994}
55995
55996static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55997 const X86Subtarget &Subtarget) {
55998 unsigned Opcode = N->getOpcode();
55999 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
56000 "Unknown PCMP opcode");
56001
56002 SDValue LHS = N->getOperand(0);
56003 SDValue RHS = N->getOperand(1);
56004 MVT VT = N->getSimpleValueType(0);
56005 unsigned EltBits = VT.getScalarSizeInBits();
56006 unsigned NumElts = VT.getVectorNumElements();
56007 SDLoc DL(N);
56008
56009 if (LHS == RHS)
56010 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
56011 : DAG.getConstant(0, DL, VT);
56012
56013 // Constant Folding.
56014 // PCMPEQ(X,UNDEF) -> UNDEF
56015 // PCMPGT(X,UNDEF) -> 0
56016 // PCMPGT(UNDEF,X) -> 0
56017 APInt LHSUndefs, RHSUndefs;
56018 SmallVector<APInt> LHSBits, RHSBits;
56019 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
56020 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
56021 APInt Ones = APInt::getAllOnes(EltBits);
56022 APInt Zero = APInt::getZero(EltBits);
56023 SmallVector<APInt> Results(NumElts);
56024 for (unsigned I = 0; I != NumElts; ++I) {
56025 if (Opcode == X86ISD::PCMPEQ) {
56026 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
56027 } else {
56028 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
56029 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
56030 }
56031 }
56032 if (Opcode == X86ISD::PCMPEQ)
56033 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
56034 return getConstVector(Results, VT, DAG, DL);
56035 }
56036
56037 return SDValue();
56038}
56039
56040// Helper to determine if we can convert an integer comparison to a float
56041// comparison by casting the operands.
56042static std::optional<unsigned>
56043CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
56044 unsigned NumSignificantBitsRHS) {
56045 MVT SVT = VT.getScalarType();
56046 assert(SVT == MVT::f32 && "Only tested for float so far");
56047 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(SVT);
56048 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
56049 "Only PCMPEQ/PCMPGT currently supported");
56050
56051 // TODO: Handle bitcastable integers.
56052
56053 // For cvt + signed compare we need lhs and rhs to be exactly representable as
56054 // a fp value.
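  // For example (illustrative): f32 has a 24-bit significand, so i32 values
  // with at most 24 significant bits convert exactly via SINT_TO_FP and an
  // integer EQ/GT on them can be evaluated as an ordered float compare.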
56055 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
56056 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
56057 return ISD::SINT_TO_FP;
56058
56059 return std::nullopt;
56060}
56061
56062/// Helper that combines an array of subvector ops as if they were the operands
56063/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56064/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56065static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56066                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56067                                      TargetLowering::DAGCombinerInfo &DCI,
56068 const X86Subtarget &Subtarget) {
56069 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56070 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56071
56072 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56073 return DAG.getUNDEF(VT);
56074
56075 if (llvm::all_of(Ops, [](SDValue Op) {
56076 return ISD::isBuildVectorAllZeros(Op.getNode());
56077 }))
56078 return getZeroVector(VT, Subtarget, DAG, DL);
56079
56080 SDValue Op0 = Ops[0];
56081 bool IsSplat = llvm::all_equal(Ops);
56082 unsigned NumOps = Ops.size();
56083 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56084 LLVMContext &Ctx = *DAG.getContext();
56085
56086 // Repeated subvectors.
56087 if (IsSplat &&
56088 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56089 // If this broadcast is inserted into both halves, use a larger broadcast.
56090 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56091 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56092
56093 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56094 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56095 (Subtarget.hasAVX2() ||
56096 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56097 VT.getScalarType(), Subtarget)))
56098 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56099 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56100 Op0.getOperand(0),
56101 DAG.getIntPtrConstant(0, DL)));
56102
56103 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56104 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56105 (Subtarget.hasAVX2() ||
56106 (EltSizeInBits >= 32 &&
56107 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56108 Op0.getOperand(0).getValueType() == VT.getScalarType())
56109 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56110
56111 // concat_vectors(extract_subvector(broadcast(x)),
56112 // extract_subvector(broadcast(x))) -> broadcast(x)
56113 // concat_vectors(extract_subvector(subv_broadcast(x)),
56114 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
56115 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56116 Op0.getOperand(0).getValueType() == VT) {
56117 SDValue SrcVec = Op0.getOperand(0);
56118 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
56119 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
56120 return Op0.getOperand(0);
56121 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56122 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
56123 return Op0.getOperand(0);
56124 }
56125
56126 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
56127 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
56128 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
56129 return DAG.getNode(Op0.getOpcode(), DL, VT,
56130 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
56131 Op0.getOperand(0), Op0.getOperand(0)),
56132 Op0.getOperand(1));
56133 }
56134
56135 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56136 // Only concat subvector high halves, which is what vperm2x128 is best at.
56137 // TODO: This should go in combineX86ShufflesRecursively eventually.
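  // Illustrative example: concat(extract_subvector(v0, 4),
  // extract_subvector(v1, 4)) of two v8i32 sources selects both high 128-bit
  // halves, which is exactly what the VPERM2X128 immediate 0x31 built below
  // encodes.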
56138 if (VT.is256BitVector() && NumOps == 2) {
56139 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56140 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56141 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56142 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56143 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56144 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56145 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56146 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56147 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56148 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56149 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56150 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56151 DAG.getBitcast(VT, Src0.getOperand(0)),
56152 DAG.getBitcast(VT, Src1.getOperand(0)),
56153 DAG.getTargetConstant(0x31, DL, MVT::i8));
56154 }
56155 }
56156 }
56157
56158 // Repeated opcode.
56159 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56160 // but it currently struggles with different vector widths.
56161 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56162 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
56163 })) {
56164 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56165 SmallVector<SDValue> Subs;
56166 for (SDValue SubOp : SubOps)
56167 Subs.push_back(SubOp.getOperand(I));
56168 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56169 };
56170 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56171 bool AllConstants = true;
56172 bool AllSubVectors = true;
56173 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56174 SDValue Sub = SubOps[I].getOperand(Op);
56175 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56176 SDValue BC = peekThroughBitcasts(Sub);
56177 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
56178 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
56179 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56180 Sub.getOperand(0).getValueType() == VT &&
56181 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
56182 }
56183 return AllConstants || AllSubVectors;
56184 };
56185
56186 switch (Op0.getOpcode()) {
56187 case X86ISD::VBROADCAST: {
56188 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56189 return Op.getOperand(0).getValueType().is128BitVector();
56190 })) {
56191 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56192 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56193 ConcatSubOperand(VT, Ops, 0),
56194 ConcatSubOperand(VT, Ops, 0));
56195 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56196 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56197 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56198 : X86ISD::PSHUFD,
56199 DL, VT, ConcatSubOperand(VT, Ops, 0),
56200 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56201 }
56202 break;
56203 }
56204 case X86ISD::MOVDDUP:
56205 case X86ISD::MOVSHDUP:
56206 case X86ISD::MOVSLDUP: {
56207 if (!IsSplat)
56208 return DAG.getNode(Op0.getOpcode(), DL, VT,
56209 ConcatSubOperand(VT, Ops, 0));
56210 break;
56211 }
56212 case X86ISD::SHUFP: {
56213 // Add SHUFPD support if/when necessary.
56214 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56215 llvm::all_of(Ops, [Op0](SDValue Op) {
56216 return Op.getOperand(2) == Op0.getOperand(2);
56217 })) {
56218 return DAG.getNode(Op0.getOpcode(), DL, VT,
56219 ConcatSubOperand(VT, Ops, 0),
56220 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56221 }
56222 break;
56223 }
56224 case X86ISD::UNPCKH:
56225 case X86ISD::UNPCKL: {
56226 // Don't concatenate build_vector patterns.
56227 if (!IsSplat && EltSizeInBits >= 32 &&
56228 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56229 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56230 none_of(Ops, [](SDValue Op) {
56231 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
56232 ISD::SCALAR_TO_VECTOR ||
56233 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
56234 ISD::SCALAR_TO_VECTOR;
56235 })) {
56236 return DAG.getNode(Op0.getOpcode(), DL, VT,
56237 ConcatSubOperand(VT, Ops, 0),
56238 ConcatSubOperand(VT, Ops, 1));
56239 }
56240 break;
56241 }
56242 case X86ISD::PSHUFHW:
56243 case X86ISD::PSHUFLW:
56244 case X86ISD::PSHUFD:
56245 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56246 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56247 return DAG.getNode(Op0.getOpcode(), DL, VT,
56248 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56249 }
56250 [[fallthrough]];
56251 case X86ISD::VPERMILPI:
56252 if (!IsSplat && EltSizeInBits == 32 &&
56253 (VT.is256BitVector() ||
56254 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56255 all_of(Ops, [&Op0](SDValue Op) {
56256 return Op0.getOperand(1) == Op.getOperand(1);
56257 })) {
56258 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56259 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56260 Res =
56261 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56262 return DAG.getBitcast(VT, Res);
56263 }
56264 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56265 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56266 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56267 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56268 return DAG.getNode(Op0.getOpcode(), DL, VT,
56269 ConcatSubOperand(VT, Ops, 0),
56270 DAG.getTargetConstant(Idx, DL, MVT::i8));
56271 }
56272 break;
56273 case X86ISD::PSHUFB:
56274 case X86ISD::PSADBW:
56275 case X86ISD::VPMADDUBSW:
56276 case X86ISD::VPMADDWD:
56277 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56278 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56279 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56280 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56281 NumOps * SrcVT.getVectorNumElements());
56282 return DAG.getNode(Op0.getOpcode(), DL, VT,
56283 ConcatSubOperand(SrcVT, Ops, 0),
56284 ConcatSubOperand(SrcVT, Ops, 1));
56285 }
56286 break;
56287 case X86ISD::VPERMV:
56288 if (!IsSplat && NumOps == 2 &&
56289 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56290 MVT OpVT = Op0.getSimpleValueType();
56291 int NumSrcElts = OpVT.getVectorNumElements();
56292 SmallVector<int, 64> ConcatMask;
56293 for (unsigned i = 0; i != NumOps; ++i) {
56294 SmallVector<int, 64> SubMask;
56295 SmallVector<SDValue, 2> SubOps;
56296 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
56297 break;
56298 for (int M : SubMask) {
56299 if (0 <= M)
56300 M += i * NumSrcElts;
56301 ConcatMask.push_back(M);
56302 }
56303 }
56304 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56305 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56306 Ops[1].getOperand(1), DAG, DL);
56307 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56308 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56309 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56310 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56311 }
56312 }
56313 break;
56314 case X86ISD::VPERMV3:
56315 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56316 MVT OpVT = Op0.getSimpleValueType();
56317 int NumSrcElts = OpVT.getVectorNumElements();
56318 SmallVector<int, 64> ConcatMask;
56319 for (unsigned i = 0; i != NumOps; ++i) {
56320 SmallVector<int, 64> SubMask;
56321 SmallVector<SDValue, 2> SubOps;
56322 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
56323 break;
56324 for (int M : SubMask) {
56325 if (0 <= M) {
56326 M += M < NumSrcElts ? 0 : NumSrcElts;
56327 M += i * NumSrcElts;
56328 }
56329 ConcatMask.push_back(M);
56330 }
56331 }
56332 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56333 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56334 Ops[1].getOperand(0), DAG, DL);
56335 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56336 Ops[1].getOperand(2), DAG, DL);
56337 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56338 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56339 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56340 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56341 }
56342 }
56343 break;
56344 case X86ISD::VPERM2X128: {
56345 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
56346 assert(NumOps == 2 && "Bad concat_vectors operands");
56347 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
56348 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
56349 // TODO: Handle zero'd subvectors.
56350 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
56351 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
56352 (int)((Imm1 >> 4) & 0x3)};
56353 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
56354 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
56355 Ops[0].getOperand(1), DAG, DL);
56356 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
56357 Ops[1].getOperand(1), DAG, DL);
56358 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
56359 DAG.getBitcast(ShuffleVT, LHS),
56360 DAG.getBitcast(ShuffleVT, RHS),
56361 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
56362 return DAG.getBitcast(VT, Res);
56363 }
56364 }
56365 break;
56366 }
56367 case X86ISD::SHUF128: {
56368 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56369 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
56370 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
56371 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
56372 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
56373 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
56374 Ops[0].getOperand(1), DAG, DL);
56375 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
56376 Ops[1].getOperand(1), DAG, DL);
56377 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
56378 DAG.getTargetConstant(Imm, DL, MVT::i8));
56379 }
56380 break;
56381 }
56382 case ISD::TRUNCATE:
56383 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56384 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56385 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56386 SrcVT == Ops[1].getOperand(0).getValueType() &&
56387 Subtarget.useAVX512Regs() &&
56388 Subtarget.getPreferVectorWidth() >= 512 &&
56389 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56390 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
56391 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56392 ConcatSubOperand(NewSrcVT, Ops, 0));
56393 }
56394 }
56395 break;
56396 case X86ISD::VSHLI:
56397 case X86ISD::VSRLI:
56398 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56399 // TODO: Move this to LowerShiftByScalarImmediate?
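      // Illustrative example: bitcast a v4i64 left-shift-by-32 to v8i32 and
      // each i64 lane {lo,hi} becomes {0,lo}, i.e. a shuffle of the source
      // with zero using the {8,0,8,2,8,4,8,6} mask built below (and the
      // mirrored mask for the logical right shift).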
56400 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56401 llvm::all_of(Ops, [](SDValue Op) {
56402 return Op.getConstantOperandAPInt(1) == 32;
56403 })) {
56404 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56405 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56406 if (Op0.getOpcode() == X86ISD::VSHLI) {
56407 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56408 {8, 0, 8, 2, 8, 4, 8, 6});
56409 } else {
56410 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56411 {1, 8, 3, 8, 5, 8, 7, 8});
56412 }
56413 return DAG.getBitcast(VT, Res);
56414 }
56415 [[fallthrough]];
56416 case X86ISD::VSRAI:
56417 case X86ISD::VSHL:
56418 case X86ISD::VSRL:
56419 case X86ISD::VSRA:
56420 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56421 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56422 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56423 llvm::all_of(Ops, [Op0](SDValue Op) {
56424 return Op0.getOperand(1) == Op.getOperand(1);
56425 })) {
56426 return DAG.getNode(Op0.getOpcode(), DL, VT,
56427 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56428 }
56429 break;
56430 case X86ISD::VPERMI:
56431 case X86ISD::VROTLI:
56432 case X86ISD::VROTRI:
56433 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56434 llvm::all_of(Ops, [Op0](SDValue Op) {
56435 return Op0.getOperand(1) == Op.getOperand(1);
56436 })) {
56437 return DAG.getNode(Op0.getOpcode(), DL, VT,
56438 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56439 }
56440 break;
56441 case ISD::AND:
56442 case ISD::OR:
56443 case ISD::XOR:
56444 case X86ISD::ANDNP:
56445 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56446 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56447 return DAG.getNode(Op0.getOpcode(), DL, VT,
56448 ConcatSubOperand(VT, Ops, 0),
56449 ConcatSubOperand(VT, Ops, 1));
56450 }
56451 break;
56452 case X86ISD::PCMPEQ:
56453 case X86ISD::PCMPGT:
56454 if (!IsSplat && VT.is256BitVector() &&
56455 (Subtarget.hasInt256() || VT == MVT::v8i32) &&
56456 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
56457 if (Subtarget.hasInt256())
56458 return DAG.getNode(Op0.getOpcode(), DL, VT,
56459 ConcatSubOperand(VT, Ops, 0),
56460 ConcatSubOperand(VT, Ops, 1));
56461
56462 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
56463 // TODO: Handle v4f64 as well?
56464 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
56465 for (unsigned I = 0; I != NumOps; ++I) {
56466 MaxSigBitsLHS =
56467 std::max(MaxSigBitsLHS,
56468 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
56469 MaxSigBitsRHS =
56470 std::max(MaxSigBitsRHS,
56471 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
56472 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
56473 break;
56474 }
56475
56476 ISD::CondCode ICC =
56477 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
56478 ISD::CondCode FCC =
56479 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
56480
56481 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
56482 MVT FpVT = VT.changeVectorElementType(FpSVT);
56483
56484 if (std::optional<unsigned> CastOpc =
56485 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
56486 SDValue LHS = ConcatSubOperand(VT, Ops, 0);
56487 SDValue RHS = ConcatSubOperand(VT, Ops, 1);
56488 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
56489 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
56490
56491 bool IsAlwaysSignaling;
56492 unsigned FSETCC =
56493 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
56494 return DAG.getBitcast(
56495 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
56496 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
56497 }
56498 }
56499 break;
56500 case ISD::CTPOP:
56501 case ISD::CTTZ:
56502 case ISD::CTLZ:
56503 case ISD::CTTZ_ZERO_UNDEF:
56504 case ISD::CTLZ_ZERO_UNDEF:
56505 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56506 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56507 return DAG.getNode(Op0.getOpcode(), DL, VT,
56508 ConcatSubOperand(VT, Ops, 0));
56509 }
56510 break;
56511 case X86ISD::GF2P8AFFINEQB:
56512 if (!IsSplat &&
56513 (VT.is256BitVector() ||
56514 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56515 llvm::all_of(Ops, [Op0](SDValue Op) {
56516 return Op0.getOperand(2) == Op.getOperand(2);
56517 })) {
56518 return DAG.getNode(Op0.getOpcode(), DL, VT,
56519 ConcatSubOperand(VT, Ops, 0),
56520 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56521 }
56522 break;
56523 case ISD::ADD:
56524 case ISD::SUB:
56525 case ISD::MUL:
56526 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56527 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56528 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56529 return DAG.getNode(Op0.getOpcode(), DL, VT,
56530 ConcatSubOperand(VT, Ops, 0),
56531 ConcatSubOperand(VT, Ops, 1));
56532 }
56533 break;
56534 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
56535 // their latencies are short, we don't replace them here unless doing so
56536 // won't introduce extra VINSERTs.
56537 case ISD::FADD:
56538 case ISD::FSUB:
56539 case ISD::FMUL:
56540 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
56541 (VT.is256BitVector() ||
56542 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56543 return DAG.getNode(Op0.getOpcode(), DL, VT,
56544 ConcatSubOperand(VT, Ops, 0),
56545 ConcatSubOperand(VT, Ops, 1));
56546 }
56547 break;
56548 case ISD::FDIV:
56549 if (!IsSplat && (VT.is256BitVector() ||
56550 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56551 return DAG.getNode(Op0.getOpcode(), DL, VT,
56552 ConcatSubOperand(VT, Ops, 0),
56553 ConcatSubOperand(VT, Ops, 1));
56554 }
56555 break;
56556 case X86ISD::HADD:
56557 case X86ISD::HSUB:
56558 case X86ISD::FHADD:
56559 case X86ISD::FHSUB:
56560 if (!IsSplat && VT.is256BitVector() &&
56561 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56562 return DAG.getNode(Op0.getOpcode(), DL, VT,
56563 ConcatSubOperand(VT, Ops, 0),
56564 ConcatSubOperand(VT, Ops, 1));
56565 }
56566 break;
56567 case X86ISD::PACKSS:
56568 case X86ISD::PACKUS:
56569 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56570 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56571 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56572 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56573 NumOps * SrcVT.getVectorNumElements());
56574 return DAG.getNode(Op0.getOpcode(), DL, VT,
56575 ConcatSubOperand(SrcVT, Ops, 0),
56576 ConcatSubOperand(SrcVT, Ops, 1));
56577 }
56578 break;
56579 case X86ISD::PALIGNR:
56580 if (!IsSplat &&
56581 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56582 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56583 llvm::all_of(Ops, [Op0](SDValue Op) {
56584 return Op0.getOperand(2) == Op.getOperand(2);
56585 })) {
56586 return DAG.getNode(Op0.getOpcode(), DL, VT,
56587 ConcatSubOperand(VT, Ops, 0),
56588 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56589 }
56590 break;
56591 case X86ISD::BLENDI:
56592 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
56593 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
56594 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
56595 // MVT::v16i16 has repeated blend mask.
56596 if (Op0.getSimpleValueType() == MVT::v16i16) {
56597 Mask0 = (Mask0 << 8) | Mask0;
56598 Mask1 = (Mask1 << 8) | Mask1;
56599 }
56600 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
56601 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
56602 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
56603 SDValue Sel =
56604 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
56605 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
56606 ConcatSubOperand(VT, Ops, 0));
56607 }
56608 break;
56609 case ISD::VSELECT:
56610 if (!IsSplat && Subtarget.hasAVX512() &&
56611 (VT.is256BitVector() ||
56612 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56613 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56614 EVT SelVT = Ops[0].getOperand(0).getValueType();
56615 if (SelVT.getVectorElementType() == MVT::i1) {
56616 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
56617 NumOps * SelVT.getVectorNumElements());
56618 if (TLI.isTypeLegal(SelVT))
56619 return DAG.getNode(Op0.getOpcode(), DL, VT,
56620 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56621 ConcatSubOperand(VT, Ops, 1),
56622 ConcatSubOperand(VT, Ops, 2));
56623 }
56624 }
56625 [[fallthrough]];
56626 case X86ISD::BLENDV:
56627 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
56628 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56629 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56630 EVT SelVT = Ops[0].getOperand(0).getValueType();
56631 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
56632 if (TLI.isTypeLegal(SelVT))
56633 return DAG.getNode(Op0.getOpcode(), DL, VT,
56634 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56635 ConcatSubOperand(VT, Ops, 1),
56636 ConcatSubOperand(VT, Ops, 2));
56637 }
56638 break;
56639 }
56640 }
56641
56642 // Fold subvector loads into one.
56643 // If needed, look through bitcasts to get to the load.
56644 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56645 unsigned Fast;
56646 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56647 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
56648 *FirstLd->getMemOperand(), &Fast) &&
56649 Fast) {
56650 if (SDValue Ld =
56651 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56652 return Ld;
56653 }
56654 }
56655
56656 // Attempt to fold target constant loads.
56657 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56658 SmallVector<APInt> EltBits;
56659 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56660 for (unsigned I = 0; I != NumOps; ++I) {
56661 APInt OpUndefElts;
56662 SmallVector<APInt> OpEltBits;
56663 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56664 OpEltBits, /*AllowWholeUndefs*/ true,
56665 /*AllowPartialUndefs*/ false))
56666 break;
56667 EltBits.append(OpEltBits);
56668 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56669 }
56670 if (EltBits.size() == VT.getVectorNumElements()) {
56671 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
56672 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
56673 SDValue CV = DAG.getConstantPool(C, PVT);
56674 MachineFunction &MF = DAG.getMachineFunction();
56675 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
56676 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
56677 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
56678 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
56679 return Ld;
56680 }
56681 }
56682
56683 // If this simple subvector or scalar/subvector broadcast_load is inserted
56684 // into both halves, use a larger broadcast_load. Update other uses to use
56685 // an extracted subvector.
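  // Illustrative example: concat(broadcast_load p, broadcast_load p) of two
  // 128-bit halves becomes one 256-bit broadcast_load of p, and any other
  // users of the original narrow node are rewritten to extract the low half.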
56686 if (IsSplat &&
56687 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56688 if (ISD::isNormalLoad(Op0.getNode()) ||
56689 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56690 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56691 auto *Mem = cast<MemSDNode>(Op0);
56692 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56693 ? X86ISD::VBROADCAST_LOAD
56694 : X86ISD::SUBV_BROADCAST_LOAD;
56695 if (SDValue BcastLd =
56696 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56697 SDValue BcastSrc =
56698 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56699 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56700 return BcastLd;
56701 }
56702 }
56703 }
56704
56705 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
56706 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
56707 Subtarget.useAVX512Regs()) {
56708 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
56709 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
56710 Res = DAG.getBitcast(ShuffleVT, Res);
56711 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
56712 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56713 return DAG.getBitcast(VT, Res);
56714 }
56715
56716 return SDValue();
56717}
56718
56719static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56720                                     TargetLowering::DAGCombinerInfo &DCI,
56721 const X86Subtarget &Subtarget) {
56722 EVT VT = N->getValueType(0);
56723 EVT SrcVT = N->getOperand(0).getValueType();
56724 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56725 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56726
56727 if (VT.getVectorElementType() == MVT::i1) {
56728 // Attempt to constant fold.
56729 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56730 APInt Constant = APInt::getZero(VT.getSizeInBits());
56731 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56732 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56733 if (!C) break;
56734 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56735 if (I == (E - 1)) {
56736 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56737 if (TLI.isTypeLegal(IntVT))
56738 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56739 }
56740 }
56741
56742 // Don't do anything else for i1 vectors.
56743 return SDValue();
56744 }
56745
56746 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56747 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56748 DCI, Subtarget))
56749 return R;
56750 }
56751
56752 return SDValue();
56753}
56754
56755static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56756                                       TargetLowering::DAGCombinerInfo &DCI,
56757 const X86Subtarget &Subtarget) {
56758 if (DCI.isBeforeLegalizeOps())
56759 return SDValue();
56760
56761 MVT OpVT = N->getSimpleValueType(0);
56762
56763 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56764
56765 SDLoc dl(N);
56766 SDValue Vec = N->getOperand(0);
56767 SDValue SubVec = N->getOperand(1);
56768
56769 uint64_t IdxVal = N->getConstantOperandVal(2);
56770 MVT SubVecVT = SubVec.getSimpleValueType();
56771
56772 if (Vec.isUndef() && SubVec.isUndef())
56773 return DAG.getUNDEF(OpVT);
56774
56775 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56776 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56777 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56778 return getZeroVector(OpVT, Subtarget, DAG, dl);
56779
56780 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56781 // If we're inserting into a zero vector and then into a larger zero vector,
56782 // just insert into the larger zero vector directly.
56783 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56784 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56785 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56786 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56787 getZeroVector(OpVT, Subtarget, DAG, dl),
56788 SubVec.getOperand(1),
56789 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56790 }
56791
56792 // If we're inserting into a zero vector and our input was extracted from an
56793 // insert into a zero vector of the same type and the extraction was at
56794 // least as large as the original insertion, just insert the original
56795 // subvector into a zero vector.
56796 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56797 isNullConstant(SubVec.getOperand(1)) &&
56798 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56799 SDValue Ins = SubVec.getOperand(0);
56800 if (isNullConstant(Ins.getOperand(2)) &&
56801 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56802 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56803 SubVecVT.getFixedSizeInBits())
56804 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56805 getZeroVector(OpVT, Subtarget, DAG, dl),
56806 Ins.getOperand(1), N->getOperand(2));
56807 }
56808 }
56809
56810 // Stop here if this is an i1 vector.
56811 if (IsI1Vector)
56812 return SDValue();
56813
56814 // Eliminate an intermediate vector widening:
56815 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56816 // insert_subvector X, Y, Idx
56817 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56818 // there?
56819 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56820 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56821 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56822 SubVec.getOperand(1), N->getOperand(2));
56823
56824 // If this is an insert of an extract, combine to a shuffle. Don't do this
56825 // if the insert or extract can be represented with a subregister operation.
56826 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56827 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56828 (IdxVal != 0 ||
56829 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56830 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56831 if (ExtIdxVal != 0) {
56832 int VecNumElts = OpVT.getVectorNumElements();
56833 int SubVecNumElts = SubVecVT.getVectorNumElements();
56834 SmallVector<int, 64> Mask(VecNumElts);
56835 // First create an identity shuffle mask.
56836 for (int i = 0; i != VecNumElts; ++i)
56837 Mask[i] = i;
56838 // Now insert the extracted portion.
56839 for (int i = 0; i != SubVecNumElts; ++i)
56840 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56841
56842 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56843 }
56844 }
56845
56846 // Match concat_vector style patterns.
56847 SmallVector<SDValue, 2> SubVectorOps;
56848 if (collectConcatOps(N, SubVectorOps, DAG)) {
56849 if (SDValue Fold =
56850 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56851 return Fold;
56852
56853 // If we're inserting all zeros into the upper half, change this to
56854 // a concat with zero. We will match this to a move
56855 // with implicit upper bit zeroing during isel.
56856 // We do this here because we don't want combineConcatVectorOps to
56857 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56858 if (SubVectorOps.size() == 2 &&
56859 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56860 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56861 getZeroVector(OpVT, Subtarget, DAG, dl),
56862 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56863
56864 // Attempt to recursively combine to a shuffle.
56865 if (all_of(SubVectorOps, [](SDValue SubOp) {
56866 return isTargetShuffle(SubOp.getOpcode());
56867 })) {
56868 SDValue Op(N, 0);
56869 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56870 return Res;
56871 }
56872 }
56873
56874 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56875 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56876 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56877
56878 // If this is a broadcast load inserted into an upper undef, use a larger
56879 // broadcast load.
56880 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56881 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56882 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56883 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56884 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56885 SDValue BcastLd =
56886 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56887 MemIntr->getMemoryVT(),
56888 MemIntr->getMemOperand());
56889 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56890 return BcastLd;
56891 }
56892
56893 // If we're splatting the lower half subvector of a full vector load into the
56894 // upper half, attempt to create a subvector broadcast.
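// For example (illustrative): if Vec is a 256-bit load from %p and SubVec is
// a 128-bit load of the same address (consecutive, non-volatile), then
//   insert_subvector Vec, SubVec, 2
// becomes a single X86ISD::SUBV_BROADCAST_LOAD of 128 bits from %p.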
56895 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56896 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56897 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56898 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56899 if (VecLd && SubLd &&
56900 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56901 SubVec.getValueSizeInBits() / 8, 0))
56902 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56903 SubLd, 0, DAG);
56904 }
56905
56906 return SDValue();
56907}
56908
56909/// If we are extracting a subvector of a vector select and the select condition
56910/// is composed of concatenated vectors, try to narrow the select width. This
56911/// is a common pattern for AVX1 integer code because 256-bit selects may be
56912/// legal, but there is almost no integer math/logic available for 256-bit.
56913/// This function should only be called with legal types (otherwise, the calls
56914/// to get simple value types will assert).
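/// For example (illustrative), with legal AVX1 types:
///   (v4i32 extract_subvector (vselect C:v8i32, T:v8i32, F:v8i32), 0)
///   --> (v4i32 vselect (extract_subvector C, 0), (extract_subvector T, 0),
///                      (extract_subvector F, 0))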
56915static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56916 SelectionDAG &DAG) {
56917 SDValue Sel = Ext->getOperand(0);
56918 if (Sel.getOpcode() != ISD::VSELECT ||
56919 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56920 return SDValue();
56921
56922 // Note: We assume simple value types because this should only be called with
56923 // legal operations/types.
56924 // TODO: This can be extended to handle extraction to 256-bits.
56925 MVT VT = Ext->getSimpleValueType(0);
56926 if (!VT.is128BitVector())
56927 return SDValue();
56928
56929 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56930 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56931 return SDValue();
56932
56933 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56934 MVT SelVT = Sel.getSimpleValueType();
56935 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56936 "Unexpected vector type with legal operations");
56937
56938 unsigned SelElts = SelVT.getVectorNumElements();
56939 unsigned CastedElts = WideVT.getVectorNumElements();
56940 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56941 if (SelElts % CastedElts == 0) {
56942 // The select has the same or more (narrower) elements than the extract
56943 // operand. The extraction index gets scaled by that factor.
56944 ExtIdx *= (SelElts / CastedElts);
56945 } else if (CastedElts % SelElts == 0) {
56946 // The select has less (wider) elements than the extract operand. Make sure
56947 // that the extraction index can be divided evenly.
56948 unsigned IndexDivisor = CastedElts / SelElts;
56949 if (ExtIdx % IndexDivisor != 0)
56950 return SDValue();
56951 ExtIdx /= IndexDivisor;
56952 } else {
56953 llvm_unreachable("Element count of simple vector types are not divisible?");
56954 }
56955
56956 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56957 unsigned NarrowElts = SelElts / NarrowingFactor;
56958 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56959 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56960 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56961 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56962 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56963 return DAG.getBitcast(VT, NarrowSel);
56964}
56965
56966static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56967 TargetLowering::DAGCombinerInfo &DCI,
56968 const X86Subtarget &Subtarget) {
56969 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56970 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56971 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56972 // We let generic combining take over from there to simplify the
56973 // insert/extract and 'not'.
56974 // This pattern emerges during AVX1 legalization. We handle it before lowering
56975 // to avoid complications like splitting constant vector loads.
56976
56977 // Capture the original wide type in the likely case that we need to bitcast
56978 // back to this type.
56979 if (!N->getValueType(0).isSimple())
56980 return SDValue();
56981
56982 MVT VT = N->getSimpleValueType(0);
56983 SDValue InVec = N->getOperand(0);
56984 unsigned IdxVal = N->getConstantOperandVal(1);
56985 SDValue InVecBC = peekThroughBitcasts(InVec);
56986 EVT InVecVT = InVec.getValueType();
56987 unsigned SizeInBits = VT.getSizeInBits();
56988 unsigned InSizeInBits = InVecVT.getSizeInBits();
56989 unsigned NumSubElts = VT.getVectorNumElements();
56990 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56991 SDLoc DL(N);
56992
56993 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56994 TLI.isTypeLegal(InVecVT) &&
56995 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56996 auto isConcatenatedNot = [](SDValue V) {
56997 V = peekThroughBitcasts(V);
56998 if (!isBitwiseNot(V))
56999 return false;
57000 SDValue NotOp = V->getOperand(0);
57001 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
57002 };
57003 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
57004 isConcatenatedNot(InVecBC.getOperand(1))) {
57005 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57006 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
57007 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57008 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57009 }
57010 }
57011
57012 if (DCI.isBeforeLegalizeOps())
57013 return SDValue();
57014
57015 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
57016 return V;
57017
57018 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
57019 return getZeroVector(VT, Subtarget, DAG, DL);
57020
57021 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
57022 if (VT.getScalarType() == MVT::i1)
57023 return DAG.getConstant(1, DL, VT);
57024 return getOnesVector(VT, DAG, DL);
57025 }
57026
57027 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
57028 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
57029
57030 // If we are extracting from an insert into a larger vector, replace with a
57031 // smaller insert if we don't access less than the original subvector. Don't
57032 // do this for i1 vectors.
57033 // TODO: Relax the matching indices requirement?
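// For example (illustrative):
//   (v8i32 extract_subvector (insert_subvector X:v16i32, Y:v4i32, 8), 8)
//   --> (v8i32 insert_subvector (extract_subvector X, 8), Y, 0)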
57034 if (VT.getVectorElementType() != MVT::i1 &&
57035 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
57036 IdxVal == InVec.getConstantOperandVal(2) &&
57037 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
57038 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57039 InVec.getOperand(0), N->getOperand(1));
57040 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57041 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
57042 InVec.getOperand(1),
57043 DAG.getVectorIdxConstant(NewIdxVal, DL));
57044 }
57045
57046 // If we're extracting an upper subvector from a broadcast we should just
57047 // extract the lowest subvector instead which should allow
57048 // SimplifyDemandedVectorElts do more simplifications.
57049 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
57050 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
57051 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
57052 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
57053
57054 // If we're extracting a broadcasted subvector, just use the lowest subvector.
57055 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57056 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57057 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
57058
57059 // Attempt to extract from the source of a shuffle vector.
57060 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
57061 SmallVector<int, 32> ShuffleMask;
57062 SmallVector<int, 32> ScaledMask;
57063 SmallVector<SDValue, 2> ShuffleInputs;
57064 unsigned NumSubVecs = InSizeInBits / SizeInBits;
57065 // Decode the shuffle mask and scale it so it's shuffling subvectors.
57066 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
57067 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
57068 unsigned SubVecIdx = IdxVal / NumSubElts;
57069 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
57070 return DAG.getUNDEF(VT);
57071 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
57072 return getZeroVector(VT, Subtarget, DAG, DL);
57073 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
57074 if (Src.getValueSizeInBits() == InSizeInBits) {
57075 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
57076 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
57077 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
57078 DL, SizeInBits);
57079 }
57080 }
57081 }
57082
57083 auto IsExtractFree = [](SDValue V) {
57084 V = peekThroughBitcasts(V);
57085 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
57086 return true;
57087 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
57088 return true;
57089 return V.isUndef();
57090 };
57091
57092 // If we're extracting the lowest subvector and we're the only user,
57093 // we may be able to perform this with a smaller vector width.
57094 unsigned InOpcode = InVec.getOpcode();
57095 if (InVec.hasOneUse()) {
57096 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
57097 // v2f64 CVTDQ2PD(v4i32).
57098 if (InOpcode == ISD::SINT_TO_FP &&
57099 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57100 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
57101 }
57102 // v2f64 CVTUDQ2PD(v4i32).
57103 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
57104 InVec.getOperand(0).getValueType() == MVT::v4i32) {
57105 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
57106 }
57107 // v2f64 CVTPS2PD(v4f32).
57108 if (InOpcode == ISD::FP_EXTEND &&
57109 InVec.getOperand(0).getValueType() == MVT::v4f32) {
57110 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
57111 }
57112 }
57113 // v4i32 CVTPS2DQ(v4f32).
57114 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
57115 SDValue Src = InVec.getOperand(0);
57116 if (Src.getValueType().getScalarType() == MVT::f32)
57117 return DAG.getNode(InOpcode, DL, VT,
57118 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
57119 }
57120 if (IdxVal == 0 &&
57121 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
57122 (SizeInBits == 128 || SizeInBits == 256) &&
57123 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
57124 SDValue Ext = InVec.getOperand(0);
57125 if (Ext.getValueSizeInBits() > SizeInBits)
57126 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
57127 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
57128 return DAG.getNode(ExtOp, DL, VT, Ext);
57129 }
57130 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
57131 InVec.getOperand(0).getValueType().is256BitVector() &&
57132 InVec.getOperand(1).getValueType().is256BitVector() &&
57133 InVec.getOperand(2).getValueType().is256BitVector()) {
57134 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
57135 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
57136 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
57137 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
57138 }
57139 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
57140 (SizeInBits == 128 || SizeInBits == 256)) {
57141 SDValue InVecSrc = InVec.getOperand(0);
57142 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
57143 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
57144 return DAG.getNode(InOpcode, DL, VT, Ext);
57145 }
57146 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
57147 InOpcode == X86ISD::PCMPGT) &&
57148 (IsExtractFree(InVec.getOperand(0)) ||
57149 IsExtractFree(InVec.getOperand(1))) &&
57150 SizeInBits == 128) {
57151 SDValue Ext0 =
57152 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57153 SDValue Ext1 =
57154 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
57155 if (InOpcode == X86ISD::CMPP)
57156 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
57157 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
57158 }
57159 if (InOpcode == X86ISD::MOVDDUP &&
57160 (SizeInBits == 128 || SizeInBits == 256)) {
57161 SDValue Ext0 =
57162 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57163 return DAG.getNode(InOpcode, DL, VT, Ext0);
57164 }
57165 }
57166
57167 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
57168 // as this is very likely to fold into a shuffle/truncation.
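// For example (illustrative):
//   (v2i64 extract_subvector (VSRLI V:v4i64, 32), 2)
//   --> (VSRLI (v2i64 extract_subvector V, 2), 32)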
57169 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
57170 InVecVT.getScalarSizeInBits() == 64 &&
57171 InVec.getConstantOperandAPInt(1) == 32) {
57172 SDValue Ext =
57173 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57174 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
57175 }
57176
57177 return SDValue();
57178}
57179
57180static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
57181 EVT VT = N->getValueType(0);
57182 SDValue Src = N->getOperand(0);
57183 SDLoc DL(N);
57184
57185 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
57186 // This occurs frequently in our masked scalar intrinsic code and our
57187 // floating point select lowering with AVX512.
57188 // TODO: SimplifyDemandedBits instead?
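// For example (illustrative):
//   (v1i1 scalar_to_vector (and X, 1)) --> (v1i1 scalar_to_vector X)
// since only bit 0 of the scalar survives in the v1i1 result.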
57189 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
57190 isOneConstant(Src.getOperand(1)))
57191 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
57192
57193 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
57194 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57195 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
57196 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57197 isNullConstant(Src.getOperand(1)))
57198 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
57199 Src.getOperand(1));
57200
57201 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
57202 // TODO: Move to DAGCombine/SimplifyDemandedBits?
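// For example (illustrative):
//   (v2i64 scalar_to_vector (i64 zero_extend X:i32))
//   --> bitcast (VZEXT_MOVL (v4i32 scalar_to_vector X))
// replacing a 64-bit GPR->XMM transfer with a 32-bit one.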
57203 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
57204 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
57205 if (Op.getValueType() != MVT::i64)
57206 return SDValue();
57207 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57208 if (Op.getOpcode() == Opc &&
57209 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57210 return Op.getOperand(0);
57211 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57212 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57213 if (Ld->getExtensionType() == Ext &&
57214 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57215 return Op;
57216 if (IsZeroExt) {
57217 KnownBits Known = DAG.computeKnownBits(Op);
57218 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57219 return Op;
57220 }
57221 return SDValue();
57222 };
57223
57224 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57225 return DAG.getBitcast(
57226 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57227 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57228
57229 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57230 return DAG.getBitcast(
57231 VT,
57232 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57233 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57234 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57235 }
57236
57237 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57238 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57239 Src.getOperand(0).getValueType() == MVT::x86mmx)
57240 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57241
57242 // See if we're broadcasting the scalar value, in which case just reuse that.
57243 // Ensure the same SDValue from the SDNode use is being used.
57244 if (VT.getScalarType() == Src.getValueType())
57245 for (SDNode *User : Src->uses())
57246 if (User->getOpcode() == X86ISD::VBROADCAST &&
57247 Src == User->getOperand(0)) {
57248 unsigned SizeInBits = VT.getFixedSizeInBits();
57249 unsigned BroadcastSizeInBits =
57250 User->getValueSizeInBits(0).getFixedValue();
57251 if (BroadcastSizeInBits == SizeInBits)
57252 return SDValue(User, 0);
57253 if (BroadcastSizeInBits > SizeInBits)
57254 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57255 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57256 // coverage.
57257 }
57258
57259 return SDValue();
57260}
57261
57262// Simplify PMULDQ and PMULUDQ operations.
57263static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57264 TargetLowering::DAGCombinerInfo &DCI,
57265 const X86Subtarget &Subtarget) {
57266 SDValue LHS = N->getOperand(0);
57267 SDValue RHS = N->getOperand(1);
57268
57269 // Canonicalize constant to RHS.
57270 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57271 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57272 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57273
57274 // Multiply by zero.
57275 // Don't return RHS as it may contain UNDEFs.
57276 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57277 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57278
57279 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57280 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57281 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57282 return SDValue(N, 0);
57283
57284 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57285 // convert it to any_extend_invec, due to the LegalOperations check, do the
57286 // conversion directly to a vector shuffle manually. This exposes combine
57287 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57288 // combineX86ShufflesRecursively on SSE4.1 targets.
57289 // FIXME: This is basically a hack around several other issues related to
57290 // ANY_EXTEND_VECTOR_INREG.
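// For example (illustrative), for the LHS operand:
//   (zero_extend_vector_inreg X:v4i32)
//   --> bitcast (vector_shuffle X, X, <0,-1,1,-1>) : v2i64
// which exposes the shuffle to the recursive shuffle combines.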
57291 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57292 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57293 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57294 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57295 SDLoc dl(N);
57296 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57297 LHS.getOperand(0), { 0, -1, 1, -1 });
57298 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57299 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57300 }
57301 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57302 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57303 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57304 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57305 SDLoc dl(N);
57306 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57307 RHS.getOperand(0), { 0, -1, 1, -1 });
57308 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57309 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57310 }
57311
57312 return SDValue();
57313}
57314
57315// Simplify VPMADDUBSW/VPMADDWD operations.
57316static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57317 TargetLowering::DAGCombinerInfo &DCI) {
57318 MVT VT = N->getSimpleValueType(0);
57319 SDValue LHS = N->getOperand(0);
57320 SDValue RHS = N->getOperand(1);
57321 unsigned Opc = N->getOpcode();
57322 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
57323 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
57324 "Unexpected PMADD opcode");
57325
57326 // Multiply by zero.
57327 // Don't return LHS/RHS as it may contain UNDEFs.
57328 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57329 ISD::isBuildVectorAllZeros(RHS.getNode()))
57330 return DAG.getConstant(0, SDLoc(N), VT);
57331
57332 // Constant folding.
57333 APInt LHSUndefs, RHSUndefs;
57334 SmallVector<APInt> LHSBits, RHSBits;
57335 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
57336 unsigned DstEltBits = VT.getScalarSizeInBits();
57337 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
57338 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
57339 SmallVector<APInt> Result;
57340 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
57341 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
57342 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
57343 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
57344 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
57345 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
57346 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
57347 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
57348 Result.push_back(Res);
57349 }
57350 return getConstVector(Result, VT, DAG, SDLoc(N));
57351 }
57352
57353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57354 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57355 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57356 return SDValue(N, 0);
57357
57358 return SDValue();
57359}
57360
57361static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57362 TargetLowering::DAGCombinerInfo &DCI,
57363 const X86Subtarget &Subtarget) {
57364 EVT VT = N->getValueType(0);
57365 SDValue In = N->getOperand(0);
57366 unsigned Opcode = N->getOpcode();
57367 unsigned InOpcode = In.getOpcode();
57368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57369 SDLoc DL(N);
57370
57371 // Try to merge vector loads and extend_inreg to an extload.
57372 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57373 In.hasOneUse()) {
57374 auto *Ld = cast<LoadSDNode>(In);
57375 if (Ld->isSimple()) {
57376 MVT SVT = In.getSimpleValueType().getVectorElementType();
57377 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57378 ? ISD::SEXTLOAD
57379 : ISD::ZEXTLOAD;
57380 EVT MemVT = VT.changeVectorElementType(SVT);
57381 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57382 SDValue Load = DAG.getExtLoad(
57383 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57384 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57385 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57386 return Load;
57387 }
57388 }
57389 }
57390
57391 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57392 if (Opcode == InOpcode)
57393 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57394
57395 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57396 // -> EXTEND_VECTOR_INREG(X).
57397 // TODO: Handle non-zero subvector indices.
57398 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57399 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57400 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57401 In.getValueSizeInBits())
57402 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57403
57404 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57405 // TODO: Move to DAGCombine?
57406 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57407 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57408 In.getValueSizeInBits() == VT.getSizeInBits()) {
57409 unsigned NumElts = VT.getVectorNumElements();
57410 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57411 EVT EltVT = In.getOperand(0).getValueType();
57412 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57413 for (unsigned I = 0; I != NumElts; ++I)
57414 Elts[I * Scale] = In.getOperand(I);
57415 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57416 }
57417
57418 // Attempt to combine as a shuffle on SSE41+ targets.
57419 if (Subtarget.hasSSE41()) {
57420 SDValue Op(N, 0);
57421 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57422 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57423 return Res;
57424 }
57425
57426 return SDValue();
57427}
57428
57429static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57430 TargetLowering::DAGCombinerInfo &DCI) {
57431 EVT VT = N->getValueType(0);
57432
57433 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57434 return DAG.getConstant(0, SDLoc(N), VT);
57435
57436 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57437 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57438 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57439 return SDValue(N, 0);
57440
57441 return SDValue();
57442}
57443
57444// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57445// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
57446// extra instructions between the conversion due to going to scalar and back.
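// For example (illustrative):
//   (f32 fp16_to_fp (fp_to_fp16 X:f32))
//   --> extract_elt (CVTPH2PS (CVTPS2PH (scalar_to_vector X), 4)), 0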
57447static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57448 const X86Subtarget &Subtarget) {
57449 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57450 return SDValue();
57451
57452 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57453 return SDValue();
57454
57455 if (N->getValueType(0) != MVT::f32 ||
57456 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57457 return SDValue();
57458
57459 SDLoc dl(N);
57460 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57461 N->getOperand(0).getOperand(0));
57462 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57463 DAG.getTargetConstant(4, dl, MVT::i32));
57464 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57465 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57466 DAG.getIntPtrConstant(0, dl));
57467}
57468
57469static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57470 TargetLowering::DAGCombinerInfo &DCI,
57471 const X86Subtarget &Subtarget) {
57472 EVT VT = N->getValueType(0);
57473 bool IsStrict = N->isStrictFPOpcode();
57474 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57475 EVT SrcVT = Src.getValueType();
57476
57477 SDLoc dl(N);
57478 if (SrcVT.getScalarType() == MVT::bf16) {
57479 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
57480 !IsStrict && Src.getOperand(0).getValueType() == VT)
57481 return Src.getOperand(0);
57482
57483 if (!SrcVT.isVector())
57484 return SDValue();
57485
57486 assert(!IsStrict && "Strict FP doesn't support BF16");
57487 if (VT.getVectorElementType() == MVT::f64) {
57488 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
57489 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
57490 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
57491 }
57492 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
57493 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
57494 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
57495 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
57496 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
57497 return DAG.getBitcast(VT, Src);
57498 }
57499
57500 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57501 return SDValue();
57502
57503 if (Subtarget.hasFP16())
57504 return SDValue();
57505
57506 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57507 return SDValue();
57508
57509 if (VT.getVectorElementType() != MVT::f32 &&
57510 VT.getVectorElementType() != MVT::f64)
57511 return SDValue();
57512
57513 unsigned NumElts = VT.getVectorNumElements();
57514 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57515 return SDValue();
57516
57517 // Convert the input to vXi16.
57518 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57519 Src = DAG.getBitcast(IntVT, Src);
57520
57521 // Widen to at least 8 input elements.
57522 if (NumElts < 8) {
57523 unsigned NumConcats = 8 / NumElts;
57524 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57525 : DAG.getConstant(0, dl, IntVT);
57526 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57527 Ops[0] = Src;
57528 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57529 }
57530
57531 // Destination is vXf32 with at least 4 elements.
57532 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57533 std::max(4U, NumElts));
57534 SDValue Cvt, Chain;
57535 if (IsStrict) {
57536 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57537 {N->getOperand(0), Src});
57538 Chain = Cvt.getValue(1);
57539 } else {
57540 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57541 }
57542
57543 if (NumElts < 4) {
57544 assert(NumElts == 2 && "Unexpected size");
57545 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57546 DAG.getIntPtrConstant(0, dl));
57547 }
57548
57549 if (IsStrict) {
57550 // Extend to the original VT if necessary.
57551 if (Cvt.getValueType() != VT) {
57552 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57553 {Chain, Cvt});
57554 Chain = Cvt.getValue(1);
57555 }
57556 return DAG.getMergeValues({Cvt, Chain}, dl);
57557 }
57558
57559 // Extend to the original VT if necessary.
57560 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57561}
57562
57563// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57564// from. Limit this to cases where the loads have the same input chain and the
57565// output chains are unused. This avoids any memory ordering issues.
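// For example (illustrative): a v4f32 and a v8f32 VBROADCAST_LOAD of the same
// pointer on the same chain can share the wider node; the narrower one is
// rewritten as an extract_subvector of the v8f32 broadcast.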
57566static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57567 TargetLowering::DAGCombinerInfo &DCI) {
57568 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57569 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57570 "Unknown broadcast load type");
57571
57572 // Only do this if the chain result is unused.
57573 if (N->hasAnyUseOfValue(1))
57574 return SDValue();
57575
57576 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57577
57578 SDValue Ptr = MemIntrin->getBasePtr();
57579 SDValue Chain = MemIntrin->getChain();
57580 EVT VT = N->getSimpleValueType(0);
57581 EVT MemVT = MemIntrin->getMemoryVT();
57582
57583 // Look at other users of our base pointer and try to find a wider broadcast.
57584 // The input chain and the size of the memory VT must match.
57585 for (SDNode *User : Ptr->uses())
57586 if (User != N && User->getOpcode() == N->getOpcode() &&
57587 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57588 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57589 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57590 MemVT.getSizeInBits() &&
57591 !User->hasAnyUseOfValue(1) &&
57592 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57593 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57594 VT.getSizeInBits());
57595 Extract = DAG.getBitcast(VT, Extract);
57596 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57597 }
57598
57599 return SDValue();
57600}
57601
57602static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57603 const X86Subtarget &Subtarget) {
57604 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57605 return SDValue();
57606
57607 bool IsStrict = N->isStrictFPOpcode();
57608 EVT VT = N->getValueType(0);
57609 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57610 EVT SrcVT = Src.getValueType();
57611
57612 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57613 SrcVT.getVectorElementType() != MVT::f32)
57614 return SDValue();
57615
57616 SDLoc dl(N);
57617
57618 SDValue Cvt, Chain;
57619 unsigned NumElts = VT.getVectorNumElements();
57620 if (Subtarget.hasFP16()) {
57621 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
57622 // v4f32 (xint_to_fp v4i64))))
57623 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
57624 // v8f16 (CVTXI2P v4i64)))
57625 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
57626 Src.getNumOperands() == 2) {
57627 SDValue Cvt0, Cvt1;
57628 SDValue Op0 = Src.getOperand(0);
57629 SDValue Op1 = Src.getOperand(1);
57630 bool IsOp0Strict = Op0->isStrictFPOpcode();
57631 if (Op0.getOpcode() != Op1.getOpcode() ||
57632 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57633 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57634 return SDValue();
57635 }
57636 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57637 if (IsStrict) {
57638 assert(IsOp0Strict && "Op0 must be strict node");
57639 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57640 ? X86ISD::STRICT_CVTSI2P
57641 : X86ISD::STRICT_CVTUI2P;
57642 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57643 {Op0.getOperand(0), Op0.getOperand(1)});
57644 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57645 {Op1.getOperand(0), Op1.getOperand(1)});
57646 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57647 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57648 }
57649 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57650 : X86ISD::CVTUI2P;
57651 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57652 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57653 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57654 }
57655 return SDValue();
57656 }
57657
57658 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57659 return SDValue();
57660
57661 // Widen to at least 4 input elements.
57662 if (NumElts < 4)
57663 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57664 DAG.getConstantFP(0.0, dl, SrcVT));
57665
57666 // Destination is v8i16 with at least 8 elements.
57667 EVT CvtVT =
57668 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57669 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57670 if (IsStrict) {
57671 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57672 {N->getOperand(0), Src, Rnd});
57673 Chain = Cvt.getValue(1);
57674 } else {
57675 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57676 }
57677
57678 // Extract down to real number of elements.
57679 if (NumElts < 8) {
57680 EVT IntVT = VT.changeVectorElementTypeToInteger();
57681 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57682 DAG.getIntPtrConstant(0, dl));
57683 }
57684
57685 Cvt = DAG.getBitcast(VT, Cvt);
57686
57687 if (IsStrict)
57688 return DAG.getMergeValues({Cvt, Chain}, dl);
57689
57690 return Cvt;
57691}
57692
57693static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57694 SDValue Src = N->getOperand(0);
57695
57696 // Turn MOVDQ2Q+simple_load into an mmx load.
57697 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57698 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57699
57700 if (LN->isSimple()) {
57701 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57702 LN->getBasePtr(),
57703 LN->getPointerInfo(),
57704 LN->getOriginalAlign(),
57705 LN->getMemOperand()->getFlags());
57706 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57707 return NewLd;
57708 }
57709 }
57710
57711 return SDValue();
57712}
57713
57714static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57715 TargetLowering::DAGCombinerInfo &DCI) {
57716 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57718 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57719 return SDValue(N, 0);
57720
57721 return SDValue();
57722}
57723
57724SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57725 DAGCombinerInfo &DCI) const {
57726 SelectionDAG &DAG = DCI.DAG;
57727 switch (N->getOpcode()) {
57728 // clang-format off
57729 default: break;
57730 case ISD::SCALAR_TO_VECTOR:
57731 return combineScalarToVector(N, DAG);
57732 case ISD::EXTRACT_VECTOR_ELT:
57733 case X86ISD::PEXTRW:
57734 case X86ISD::PEXTRB:
57735 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57736 case ISD::CONCAT_VECTORS:
57737 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57738 case ISD::INSERT_SUBVECTOR:
57739 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57740 case ISD::EXTRACT_SUBVECTOR:
57741 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57742 case ISD::VSELECT:
57743 case ISD::SELECT:
57744 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57745 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57746 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57747 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
57748 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57749 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57750 case X86ISD::ADD:
57751 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
57752 case X86ISD::CLOAD:
57753 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
57754 case X86ISD::SBB: return combineSBB(N, DAG);
57755 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57756 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57757 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
57758 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57759 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57760 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57761 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57762 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57763 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
57764 case ISD::AVGCEILS:
57765 case ISD::AVGCEILU:
57766 case ISD::AVGFLOORS:
57767 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
57768 case X86ISD::BEXTR:
57769 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57770 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57771 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57772 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57773 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57774 case X86ISD::VEXTRACT_STORE:
57775 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57776 case ISD::SINT_TO_FP:
57777 case ISD::STRICT_SINT_TO_FP:
57778 return combineSIntToFP(N, DAG, DCI, Subtarget);
57779 case ISD::UINT_TO_FP:
57780 case ISD::STRICT_UINT_TO_FP:
57781 return combineUIntToFP(N, DAG, Subtarget);
57782 case ISD::LRINT:
57783 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
57784 case ISD::FADD:
57785 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57786 case X86ISD::VFCMULC:
57787 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57788 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57789 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57790 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57791 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57792 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57793 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57794 case X86ISD::FXOR:
57795 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57796 case X86ISD::FMIN:
57797 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57798 case ISD::FMINNUM:
57799 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57800 case X86ISD::CVTSI2P:
57801 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57802 case X86ISD::CVTP2SI:
57803 case X86ISD::CVTP2UI:
57804 case X86ISD::STRICT_CVTTP2SI:
57805 case X86ISD::CVTTP2SI:
57806 case X86ISD::STRICT_CVTTP2UI:
57807 case X86ISD::CVTTP2UI:
57808 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57809 case X86ISD::STRICT_CVTPH2PS:
57810 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57811 case X86ISD::BT: return combineBT(N, DAG, DCI);
57812 case ISD::ANY_EXTEND:
57813 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57814 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57815 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57816 case ISD::ANY_EXTEND_VECTOR_INREG:
57817 case ISD::SIGN_EXTEND_VECTOR_INREG:
57818 case ISD::ZERO_EXTEND_VECTOR_INREG:
57819 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57820 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57821 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57822 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57823 case X86ISD::PACKSS:
57824 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57825 case X86ISD::HADD:
57826 case X86ISD::HSUB:
57827 case X86ISD::FHADD:
57828 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57829 case X86ISD::VSHL:
57830 case X86ISD::VSRA:
57831 case X86ISD::VSRL:
57832 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57833 case X86ISD::VSHLI:
57834 case X86ISD::VSRAI:
57835 case X86ISD::VSRLI:
57836 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57837 case ISD::INSERT_VECTOR_ELT:
57838 case X86ISD::PINSRB:
57839 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57840 case X86ISD::SHUFP: // Handle all target specific shuffles
57841 case X86ISD::INSERTPS:
57842 case X86ISD::EXTRQI:
57843 case X86ISD::INSERTQI:
57844 case X86ISD::VALIGN:
57845 case X86ISD::PALIGNR:
57846 case X86ISD::VSHLDQ:
57847 case X86ISD::VSRLDQ:
57848 case X86ISD::BLENDI:
57849 case X86ISD::UNPCKH:
57850 case X86ISD::UNPCKL:
57851 case X86ISD::MOVHLPS:
57852 case X86ISD::MOVLHPS:
57853 case X86ISD::PSHUFB:
57854 case X86ISD::PSHUFD:
57855 case X86ISD::PSHUFHW:
57856 case X86ISD::PSHUFLW:
57857 case X86ISD::MOVSHDUP:
57858 case X86ISD::MOVSLDUP:
57859 case X86ISD::MOVDDUP:
57860 case X86ISD::MOVSS:
57861 case X86ISD::MOVSD:
57862 case X86ISD::MOVSH:
57863 case X86ISD::VBROADCAST:
57864 case X86ISD::VPPERM:
57865 case X86ISD::VPERMI:
57866 case X86ISD::VPERMV:
57867 case X86ISD::VPERMV3:
57868 case X86ISD::VPERMIL2:
57869 case X86ISD::VPERMILPI:
57870 case X86ISD::VPERMILPV:
57871 case X86ISD::VPERM2X128:
57872 case X86ISD::SHUF128:
57873 case X86ISD::VZEXT_MOVL:
57874 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
57875 case X86ISD::FMADD_RND:
57876 case X86ISD::FMSUB:
57877 case X86ISD::STRICT_FMSUB:
57878 case X86ISD::FMSUB_RND:
57879 case X86ISD::FNMADD:
57880 case X86ISD::STRICT_FNMADD:
57881 case X86ISD::FNMADD_RND:
57882 case X86ISD::FNMSUB:
57883 case X86ISD::STRICT_FNMSUB:
57884 case X86ISD::FNMSUB_RND:
57885 case ISD::FMA:
57886 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57887 case X86ISD::FMADDSUB_RND:
57888 case X86ISD::FMSUBADD_RND:
57889 case X86ISD::FMADDSUB:
57890 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57891 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57892 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57893 case X86ISD::MGATHER:
57894 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
57895 case ISD::MGATHER:
57896 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57897 case X86ISD::PCMPEQ:
57898 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57899 case X86ISD::PMULDQ:
57900 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57901 case X86ISD::VPMADDUBSW:
57902 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57903 case X86ISD::KSHIFTL:
57904 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57905 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57906 case ISD::STRICT_FP_EXTEND:
57907 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
57908 case ISD::STRICT_FP_ROUND:
57909 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57910 case X86ISD::VBROADCAST_LOAD:
57911 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57912 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57913 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57914 // clang-format on
57915 }
57916
57917 return SDValue();
57918}
57919
57920bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57921 return false;
57922}
57923
57924// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57925bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
57926 EVT ExtVT) const {
57927 return Subtarget.hasAVX512() || !VT.isVector();
57928}
57929
57930bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57931 if (!isTypeLegal(VT))
57932 return false;
57933
57934 // There are no vXi8 shifts.
57935 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57936 return false;
57937
57938 // TODO: Almost no 8-bit ops are desirable because they have no actual
57939 // size/speed advantages vs. 32-bit ops, but they do have a major
57940 // potential disadvantage by causing partial register stalls.
57941 //
57942 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57943 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57944 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57945 // check for a constant operand to the multiply.
57946 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57947 return false;
57948
57949 // i16 instruction encodings are longer and some i16 instructions are slow,
57950 // so those are not desirable.
57951 if (VT == MVT::i16) {
57952 switch (Opc) {
57953 default:
57954 break;
57955 case ISD::LOAD:
57956 case ISD::SIGN_EXTEND:
57957 case ISD::ZERO_EXTEND:
57958 case ISD::ANY_EXTEND:
57959 case ISD::MUL:
57960 return false;
57961 case ISD::SHL:
57962 case ISD::SRA:
57963 case ISD::SRL:
57964 case ISD::SUB:
57965 case ISD::ADD:
57966 case ISD::AND:
57967 case ISD::OR:
57968 case ISD::XOR:
57969 // NDD instruction never has "partial register write" issue b/c it has
57970 // destination register's upper bits [63:OSIZE]) zeroed even when
57971 // OSIZE=8/16.
57972 return Subtarget.hasNDD();
57973 }
57974 }
57975
57976 // Any legal type not explicitly accounted for above here is desirable.
57977 return true;
57978}
57979
57980SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57981 SDValue Value, SDValue Addr,
57982 int JTI,
57983 SelectionDAG &DAG) const {
57984 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57985 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57986 if (IsCFProtectionSupported) {
57987 // In case control-flow branch protection is enabled, we need to add
57988 // notrack prefix to the indirect branch.
57989 // In order to do that we create NT_BRIND SDNode.
57990 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
57991 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57992 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57993 }
57994
57995 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57996}
57997
57998TargetLowering::AndOrSETCCFoldKind
57999X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
58000 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
58001 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
58002 EVT VT = LogicOp->getValueType(0);
58003 EVT OpVT = SETCC0->getOperand(0).getValueType();
58004 if (!VT.isInteger())
58005 return AndOrSETCCFoldKind::None;
58006
58007 if (VT.isVector())
58008 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
58009 (isOperationLegal(ISD::ABS, OpVT)
58010 ? AndOrSETCCFoldKind::ABS
58011 : AndOrSETCCFoldKind::None));
58012
58013 // Don't use `NotAnd` as even though `not` is generally shorter code size than
58014 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
58015 // `NotAnd` applies, `AddAnd` does as well.
58016 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
58017 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
58018 return AndOrSETCCFoldKind::AddAnd;
58019}
58020
58021bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
58022 EVT VT = Op.getValueType();
58023 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
58024 isa<ConstantSDNode>(Op.getOperand(1));
58025
58026 // i16 is legal, but undesirable since i16 instruction encodings are longer
58027 // and some i16 instructions are slow.
58028 // 8-bit multiply-by-constant can usually be expanded to something cheaper
58029 // using LEA and/or other ALU ops.
58030 if (VT != MVT::i16 && !Is8BitMulByConstant)
58031 return false;
58032
58033 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
58034 if (!Op.hasOneUse())
58035 return false;
58036 SDNode *User = *Op->use_begin();
58037 if (User->getOpcode() != ISD::STORE)
58038 return false;
58039 auto *Ld = cast<LoadSDNode>(Load);
58040 auto *St = cast<StoreSDNode>(User);
58041 return Ld->getBasePtr() == St->getBasePtr();
58042 };
58043
58044 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
58045 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
58046 return false;
58047 if (!Op.hasOneUse())
58048 return false;
58049 SDNode *User = *Op->use_begin();
58050 if (User->getOpcode() != ISD::ATOMIC_STORE)
58051 return false;
58052 auto *Ld = cast<AtomicSDNode>(Load);
58053 auto *St = cast<AtomicSDNode>(User);
58054 return Ld->getBasePtr() == St->getBasePtr();
58055 };
58056
58057 bool Commute = false;
58058 switch (Op.getOpcode()) {
58059 default: return false;
58060 case ISD::SIGN_EXTEND:
58061 case ISD::ZERO_EXTEND:
58062 case ISD::ANY_EXTEND:
58063 break;
58064 case ISD::SHL:
58065 case ISD::SRA:
58066 case ISD::SRL: {
58067 SDValue N0 = Op.getOperand(0);
58068 // Look out for (store (shl (load), x)).
58069 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
58070 return false;
58071 break;
58072 }
58073 case ISD::ADD:
58074 case ISD::MUL:
58075 case ISD::AND:
58076 case ISD::OR:
58077 case ISD::XOR:
58078 Commute = true;
58079 [[fallthrough]];
58080 case ISD::SUB: {
58081 SDValue N0 = Op.getOperand(0);
58082 SDValue N1 = Op.getOperand(1);
58083 // Avoid disabling potential load folding opportunities.
58084 if (X86::mayFoldLoad(N1, Subtarget) &&
58085 (!Commute || !isa<ConstantSDNode>(N0) ||
58086 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
58087 return false;
58088 if (X86::mayFoldLoad(N0, Subtarget) &&
58089 ((Commute && !isa<ConstantSDNode>(N1)) ||
58090 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
58091 return false;
58092 if (IsFoldableAtomicRMW(N0, Op) ||
58093 (Commute && IsFoldableAtomicRMW(N1, Op)))
58094 return false;
58095 }
58096 }
58097
58098 PVT = MVT::i32;
58099 return true;
58100}
58101
58102//===----------------------------------------------------------------------===//
58103// X86 Inline Assembly Support
58104//===----------------------------------------------------------------------===//
58105
58106// Helper to match a string separated by whitespace.
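// For example (illustrative): matchAsm("bswap $0", {"bswap", "$0"}) succeeds,
// while matchAsm("bswapper $0", {"bswap", "$0"}) fails because "bswap" only
// matches a prefix of the first token.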
58107static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
58108 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
58109
58110 for (StringRef Piece : Pieces) {
58111 if (!S.starts_with(Piece)) // Check if the piece matches.
58112 return false;
58113
58114 S = S.substr(Piece.size());
58115 StringRef::size_type Pos = S.find_first_not_of(" \t");
58116 if (Pos == 0) // We matched a prefix.
58117 return false;
58118
58119 S = S.substr(Pos);
58120 }
58121
58122 return S.empty();
58123}
58124
58125static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
58126
58127 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
58128 if (llvm::is_contained(AsmPieces, "~{cc}") &&
58129 llvm::is_contained(AsmPieces, "~{flags}") &&
58130 llvm::is_contained(AsmPieces, "~{fpsr}")) {
58131
58132 if (AsmPieces.size() == 3)
58133 return true;
58134 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
58135 return true;
58136 }
58137 }
58138 return false;
58139}
58140
58141bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
58142 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58143
58144 const std::string &AsmStr = IA->getAsmString();
58145
58146 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58147 if (!Ty || Ty->getBitWidth() % 16 != 0)
58148 return false;
58149
58150 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58151 SmallVector<StringRef, 4> AsmPieces;
58152 SplitString(AsmStr, AsmPieces, ";\n");
58153
58154 switch (AsmPieces.size()) {
58155 default: return false;
58156 case 1:
58157 // FIXME: this should verify that we are targeting a 486 or better. If not,
58158 // we will turn this bswap into something that will be lowered to logical
58159 // ops instead of emitting the bswap asm. For now, we don't support 486 or
58160 // lower so don't worry about this.
58161 // bswap $0
58162 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
58163 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
58164 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
58165 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
58166 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
58167 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
58168 // No need to check constraints, nothing other than the equivalent of
58169 // "=r,0" would be valid here.
58170 return IntrinsicLowering::LowerToByteSwap(CI);
58171 }
58172
58173 // rorw $$8, ${0:w} --> llvm.bswap.i16
58174 if (CI->getType()->isIntegerTy(16) &&
58175 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58176 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
58177 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
58178 AsmPieces.clear();
58179 StringRef ConstraintsStr = IA->getConstraintString();
58180 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58181 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58182 if (clobbersFlagRegisters(AsmPieces))
58183 return IntrinsicLowering::LowerToByteSwap(CI);
58184 }
58185 break;
58186 case 3:
58187 if (CI->getType()->isIntegerTy(32) &&
58188 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58189 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
58190 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
58191 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
58192 AsmPieces.clear();
58193 StringRef ConstraintsStr = IA->getConstraintString();
58194 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58195 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58196 if (clobbersFlagRegisters(AsmPieces))
58197 return IntrinsicLowering::LowerToByteSwap(CI);
58198 }
58199
58200 if (CI->getType()->isIntegerTy(64)) {
58201 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58202 if (Constraints.size() >= 2 &&
58203 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
58204 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
58205 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
58206 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
58207 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
58208 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
58209 return IntrinsicLowering::LowerToByteSwap(CI);
58210 }
58211 }
58212 break;
58213 }
58214 return false;
58215}
58216
58217static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
58218 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
58219 .Case("{@cca}", X86::COND_A)
58220 .Case("{@ccae}", X86::COND_AE)
58221 .Case("{@ccb}", X86::COND_B)
58222 .Case("{@ccbe}", X86::COND_BE)
58223 .Case("{@ccc}", X86::COND_B)
58224 .Case("{@cce}", X86::COND_E)
58225 .Case("{@ccz}", X86::COND_E)
58226 .Case("{@ccg}", X86::COND_G)
58227 .Case("{@ccge}", X86::COND_GE)
58228 .Case("{@ccl}", X86::COND_L)
58229 .Case("{@ccle}", X86::COND_LE)
58230 .Case("{@ccna}", X86::COND_BE)
58231 .Case("{@ccnae}", X86::COND_B)
58232 .Case("{@ccnb}", X86::COND_AE)
58233 .Case("{@ccnbe}", X86::COND_A)
58234 .Case("{@ccnc}", X86::COND_AE)
58235 .Case("{@ccne}", X86::COND_NE)
58236 .Case("{@ccnz}", X86::COND_NE)
58237 .Case("{@ccng}", X86::COND_LE)
58238 .Case("{@ccnge}", X86::COND_L)
58239 .Case("{@ccnl}", X86::COND_GE)
58240 .Case("{@ccnle}", X86::COND_G)
58241 .Case("{@ccno}", X86::COND_NO)
58242 .Case("{@ccnp}", X86::COND_NP)
58243 .Case("{@ccns}", X86::COND_NS)
58244 .Case("{@cco}", X86::COND_O)
58245 .Case("{@ccp}", X86::COND_P)
58246 .Case("{@ccs}", X86::COND_S)
58247 .Default(X86::COND_INVALID);
58248 return Cond;
58249}
58250
58251/// Given a constraint letter, return the type of constraint for this target.
58252X86TargetLowering::ConstraintType
58253X86TargetLowering::getConstraintType(StringRef Constraint) const {
58254 if (Constraint.size() == 1) {
58255 switch (Constraint[0]) {
58256 case 'R':
58257 case 'q':
58258 case 'Q':
58259 case 'f':
58260 case 't':
58261 case 'u':
58262 case 'y':
58263 case 'x':
58264 case 'v':
58265 case 'l':
58266 case 'k': // AVX512 masking registers.
58267 return C_RegisterClass;
58268 case 'a':
58269 case 'b':
58270 case 'c':
58271 case 'd':
58272 case 'S':
58273 case 'D':
58274 case 'A':
58275 return C_Register;
58276 case 'I':
58277 case 'J':
58278 case 'K':
58279 case 'N':
58280 case 'G':
58281 case 'L':
58282 case 'M':
58283 return C_Immediate;
58284 case 'C':
58285 case 'e':
58286 case 'Z':
58287 return C_Other;
58288 default:
58289 break;
58290 }
58291 }
58292 else if (Constraint.size() == 2) {
58293 switch (Constraint[0]) {
58294 default:
58295 break;
58296 case 'W':
58297 if (Constraint[1] != 's')
58298 break;
58299 return C_Other;
58300 case 'Y':
58301 switch (Constraint[1]) {
58302 default:
58303 break;
58304 case 'z':
58305 return C_Register;
58306 case 'i':
58307 case 'm':
58308 case 'k':
58309 case 't':
58310 case '2':
58311 return C_RegisterClass;
58312 }
58313 break;
58314 case 'j':
58315 switch (Constraint[1]) {
58316 default:
58317 break;
58318 case 'r':
58319 case 'R':
58320 return C_RegisterClass;
58321 }
58322 }
58323 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58324 return C_Other;
58325 return TargetLowering::getConstraintType(Constraint);
58326}
58327
58328/// Examine constraint type and operand type and determine a weight value.
58329/// This object must already have been set up with the operand type
58330/// and the current alternative constraint selected.
58331TargetLowering::ConstraintWeight
58332X86TargetLowering::getSingleConstraintMatchWeight(
58333 AsmOperandInfo &Info, const char *Constraint) const {
58334 ConstraintWeight Wt = CW_Invalid;
58335 Value *CallOperandVal = Info.CallOperandVal;
58336 // If we don't have a value, we can't do a match,
58337 // but allow it at the lowest weight.
58338 if (!CallOperandVal)
58339 return CW_Default;
58340 Type *Ty = CallOperandVal->getType();
58341 // Look at the constraint type.
58342 switch (*Constraint) {
58343 default:
58344 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
58345 [[fallthrough]];
58346 case 'R':
58347 case 'q':
58348 case 'Q':
58349 case 'a':
58350 case 'b':
58351 case 'c':
58352 case 'd':
58353 case 'S':
58354 case 'D':
58355 case 'A':
58356 if (CallOperandVal->getType()->isIntegerTy())
58357 Wt = CW_SpecificReg;
58358 break;
58359 case 'f':
58360 case 't':
58361 case 'u':
58362 if (Ty->isFloatingPointTy())
58363 Wt = CW_SpecificReg;
58364 break;
58365 case 'y':
58366 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58367 Wt = CW_SpecificReg;
58368 break;
58369 case 'Y':
58370 if (StringRef(Constraint).size() != 2)
58371 break;
58372 switch (Constraint[1]) {
58373 default:
58374 return CW_Invalid;
58375 // XMM0
58376 case 'z':
58377 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58378 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58379 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58380 return CW_SpecificReg;
58381 return CW_Invalid;
58382 // Conditional OpMask regs (AVX512)
58383 case 'k':
58384 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58385 return CW_Register;
58386 return CW_Invalid;
58387 // Any MMX reg
58388 case 'm':
58389 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58390 return Wt;
58391 return CW_Invalid;
58392 // Any SSE reg when ISA >= SSE2, same as 'x'
58393 case 'i':
58394 case 't':
58395 case '2':
58396 if (!Subtarget.hasSSE2())
58397 return CW_Invalid;
58398 break;
58399 }
58400 break;
58401 case 'j':
58402 if (StringRef(Constraint).size() != 2)
58403 break;
58404 switch (Constraint[1]) {
58405 default:
58406 return CW_Invalid;
58407 case 'r':
58408 case 'R':
58409 if (CallOperandVal->getType()->isIntegerTy())
58410 Wt = CW_SpecificReg;
58411 break;
58412 }
58413 break;
58414 case 'v':
58415 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58416 Wt = CW_Register;
58417 [[fallthrough]];
58418 case 'x':
58419 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58420 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58421 Wt = CW_Register;
58422 break;
58423 case 'k':
58424 // Enable conditional vector operations using %k<#> registers.
58425 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58426 Wt = CW_Register;
58427 break;
58428 case 'I':
58429 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
58430 if (C->getZExtValue() <= 31)
58431 Wt = CW_Constant;
58432 break;
58433 case 'J':
58434 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58435 if (C->getZExtValue() <= 63)
58436 Wt = CW_Constant;
58437 break;
58438 case 'K':
58439 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58440 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58441 Wt = CW_Constant;
58442 break;
58443 case 'L':
58444 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58445 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58446 Wt = CW_Constant;
58447 break;
58448 case 'M':
58449 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58450 if (C->getZExtValue() <= 3)
58451 Wt = CW_Constant;
58452 break;
58453 case 'N':
58454 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58455 if (C->getZExtValue() <= 0xff)
58456 Wt = CW_Constant;
58457 break;
58458 case 'G':
58459 case 'C':
58460 if (isa<ConstantFP>(CallOperandVal))
58461 Wt = CW_Constant;
58462 break;
58463 case 'e':
58464 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58465 if ((C->getSExtValue() >= -0x80000000LL) &&
58466 (C->getSExtValue() <= 0x7fffffffLL))
58467 Wt = CW_Constant;
58468 break;
58469 case 'Z':
58470 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58471 if (C->getZExtValue() <= 0xffffffff)
58472 Wt = CW_Constant;
58473 break;
58474 }
58475 return Wt;
58476}
58477
58478/// Try to replace an X constraint, which matches anything, with another that
58479/// has more specific requirements based on the type of the corresponding
58480/// operand.
58481const char *X86TargetLowering::
58482LowerXConstraint(EVT ConstraintVT) const {
58483 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58484 // 'f' like normal targets.
58485 if (ConstraintVT.isFloatingPoint()) {
58486 if (Subtarget.hasSSE1())
58487 return "x";
58488 }
58489
58490 return TargetLowering::LowerXConstraint(ConstraintVT);
58491}
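// Illustrative sketch (not from the source file; assumes GCC/Clang extended
// asm): with SSE1 available, an 'X' ("anything") floating-point operand is
// re-lowered to "x" by LowerXConstraint above and lands in an XMM register.
#if 0
static inline void escape_fp(float f) {
  __asm__ volatile("" : : "X"(f)); // 'X' becomes "x" when SSE1 is enabled
}
#endif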
58492
58493// Lower @cc targets via setcc.
58494SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58495 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58496 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58497 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58498 if (Cond == X86::COND_INVALID)
58499 return SDValue();
58500 // Check that return type is valid.
58501 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58502 OpInfo.ConstraintVT.getSizeInBits() < 8)
58503 report_fatal_error("Glue output operand is of invalid type");
58504
58505 // Get EFLAGS register. Only update chain when copyfrom is glued.
58506 if (Glue.getNode()) {
58507 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58508 Chain = Glue.getValue(1);
58509 } else
58510 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58511 // Extract CC code.
58512 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58513 // Extend to 32-bits
58514 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58515
58516 return Result;
58517}
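// Illustrative sketch (not from the source file; assumes GCC/Clang flag-output
// syntax): an "=@ccz" output is parsed by parseConstraintCode(), read from
// EFLAGS via the CopyFromReg above, and zero-extended from the SETcc result.
#if 0
static inline int dec_hits_zero(unsigned *v) {
  int zf;
  __asm__("decl %0" : "+m"(*v), "=@ccz"(zf));
  return zf;
}
#endif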
58518
58519/// Lower the specified operand into the Ops vector.
58520/// If it is invalid, don't add anything to Ops.
58521void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58522 StringRef Constraint,
58523 std::vector<SDValue> &Ops,
58524 SelectionDAG &DAG) const {
58525 SDValue Result;
58526 char ConstraintLetter = Constraint[0];
58527 switch (ConstraintLetter) {
58528 default: break;
58529 case 'I':
58530 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58531 if (C->getZExtValue() <= 31) {
58532 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58533 Op.getValueType());
58534 break;
58535 }
58536 }
58537 return;
58538 case 'J':
58539 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58540 if (C->getZExtValue() <= 63) {
58541 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58542 Op.getValueType());
58543 break;
58544 }
58545 }
58546 return;
58547 case 'K':
58548 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58549 if (isInt<8>(C->getSExtValue())) {
58550 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58551 Op.getValueType());
58552 break;
58553 }
58554 }
58555 return;
58556 case 'L':
58557 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58558 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58559 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58560 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58561 Op.getValueType());
58562 break;
58563 }
58564 }
58565 return;
58566 case 'M':
58567 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58568 if (C->getZExtValue() <= 3) {
58569 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58570 Op.getValueType());
58571 break;
58572 }
58573 }
58574 return;
58575 case 'N':
58576 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58577 if (C->getZExtValue() <= 255) {
58578 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58579 Op.getValueType());
58580 break;
58581 }
58582 }
58583 return;
58584 case 'O':
58585 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58586 if (C->getZExtValue() <= 127) {
58587 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58588 Op.getValueType());
58589 break;
58590 }
58591 }
58592 return;
58593 case 'e': {
58594 // 32-bit signed value
58595 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58596 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58597 C->getSExtValue())) {
58598 // Widen to 64 bits here to get it sign extended.
58599 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58600 break;
58601 }
58602 // FIXME gcc accepts some relocatable values here too, but only in certain
58603 // memory models; it's complicated.
58604 }
58605 return;
58606 }
58607 case 'W': {
58608 assert(Constraint[1] == 's');
58609 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
58610 // offset.
58611 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
58612 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
58613 BA->getValueType(0)));
58614 } else {
58615 int64_t Offset = 0;
58616 if (Op->getOpcode() == ISD::ADD &&
58617 isa<ConstantSDNode>(Op->getOperand(1))) {
58618 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
58619 Op = Op->getOperand(0);
58620 }
58621 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58622 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
58623 GA->getValueType(0), Offset));
58624 }
58625 return;
58626 }
58627 case 'Z': {
58628 // 32-bit unsigned value
58629 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58630 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58631 C->getZExtValue())) {
58632 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58633 Op.getValueType());
58634 break;
58635 }
58636 }
58637 // FIXME gcc accepts some relocatable values here too, but only in certain
58638 // memory models; it's complicated.
58639 return;
58640 }
58641 case 'i': {
58642 // Literal immediates are always ok.
58643 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58644 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58645 BooleanContent BCont = getBooleanContents(MVT::i64);
58646 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58647 : ISD::SIGN_EXTEND;
58648 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58649 : CST->getSExtValue();
58650 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58651 break;
58652 }
58653
58654 // In any sort of PIC mode addresses need to be computed at runtime by
58655 // adding in a register or some sort of table lookup. These can't
58656 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58657 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58658 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58659 return;
58660
58661 // If we are in non-pic codegen mode, we allow the address of a global (with
58662 // an optional displacement) to be used with 'i'.
58663 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58664 // If we require an extra load to get this address, as in PIC mode, we
58665 // can't accept it.
58666 if (isGlobalStubReference(
58667 Subtarget.classifyGlobalReference(GA->getGlobal())))
58668 return;
58669 break;
58670 }
58671 }
58672
58673 if (Result.getNode()) {
58674 Ops.push_back(Result);
58675 return;
58676 }
58677 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58678}
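// Illustrative sketch (not from the source file; assumes GCC/Clang extended
// asm): immediate constraints validated above. 'I' accepts 0..31, 'J' 0..63,
// 'N' 0..255; constants outside the range are not added to Ops and the asm
// operand is rejected.
#if 0
static inline unsigned rol5(unsigned x) {
  __asm__("roll %1, %0" : "+r"(x) : "I"(5)); // 5 fits the 0..31 range of 'I'
  return x;
}
static inline void outb80(unsigned char v) {
  __asm__ volatile("outb %0, %1" : : "a"(v), "N"(0x80)); // 0x80 fits 'N'
}
#endif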
58679
58680/// Check if \p RC is a general purpose register class.
58681/// I.e., GR* or one of their variant.
58682static bool isGRClass(const TargetRegisterClass &RC) {
58683 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58684 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58685 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58686 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58687 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58688}
58689
58690/// Check if \p RC is a vector register class.
58691/// I.e., FR* / VR* or one of their variant.
58692static bool isFRClass(const TargetRegisterClass &RC) {
58693 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58694 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58695 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58696 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58697 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58698 RC.hasSuperClassEq(&X86::VR512RegClass);
58699}
58700
58701/// Check if \p RC is a mask register class.
58702/// I.e., VK* or one of their variant.
58703static bool isVKClass(const TargetRegisterClass &RC) {
58704 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58705 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58706 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58707 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58708 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58709 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58710 RC.hasSuperClassEq(&X86::VK64RegClass);
58711}
58712
58713static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
58714 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
58715}
58716
58717std::pair<unsigned, const TargetRegisterClass *>
58718X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58719 StringRef Constraint,
58720 MVT VT) const {
58721 // First, see if this is a constraint that directly corresponds to an LLVM
58722 // register class.
58723 if (Constraint.size() == 1) {
58724 // GCC Constraint Letters
58725 switch (Constraint[0]) {
58726 default: break;
58727 // 'A' means [ER]AX + [ER]DX.
58728 case 'A':
58729 if (Subtarget.is64Bit())
58730 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58731 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58732 "Expecting 64, 32 or 16 bit subtarget");
58733 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58734
58735 // TODO: Slight differences here in allocation order and leaving
58736 // RIP in the class. Do they matter any more here than they do
58737 // in the normal allocation?
58738 case 'k':
58739 if (Subtarget.hasAVX512()) {
58740 if (VT == MVT::v1i1 || VT == MVT::i1)
58741 return std::make_pair(0U, &X86::VK1RegClass);
58742 if (VT == MVT::v8i1 || VT == MVT::i8)
58743 return std::make_pair(0U, &X86::VK8RegClass);
58744 if (VT == MVT::v16i1 || VT == MVT::i16)
58745 return std::make_pair(0U, &X86::VK16RegClass);
58746 }
58747 if (Subtarget.hasBWI()) {
58748 if (VT == MVT::v32i1 || VT == MVT::i32)
58749 return std::make_pair(0U, &X86::VK32RegClass);
58750 if (VT == MVT::v64i1 || VT == MVT::i64)
58751 return std::make_pair(0U, &X86::VK64RegClass);
58752 }
58753 break;
58754 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58755 if (Subtarget.is64Bit()) {
58756 if (VT == MVT::i8 || VT == MVT::i1)
58757 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58758 ? &X86::GR8RegClass
58759 : &X86::GR8_NOREX2RegClass);
58760 if (VT == MVT::i16)
58761 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58762 ? &X86::GR16RegClass
58763 : &X86::GR16_NOREX2RegClass);
58764 if (VT == MVT::i32 || VT == MVT::f32)
58765 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58766 ? &X86::GR32RegClass
58767 : &X86::GR32_NOREX2RegClass);
58768 if (VT != MVT::f80 && !VT.isVector())
58769 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58770 ? &X86::GR64RegClass
58771 : &X86::GR64_NOREX2RegClass);
58772 break;
58773 }
58774 [[fallthrough]];
58775 // 32-bit fallthrough
58776 case 'Q': // Q_REGS
58777 if (VT == MVT::i8 || VT == MVT::i1)
58778 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58779 if (VT == MVT::i16)
58780 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58781 if (VT == MVT::i32 || VT == MVT::f32 ||
58782 (!VT.isVector() && !Subtarget.is64Bit()))
58783 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58784 if (VT != MVT::f80 && !VT.isVector())
58785 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58786 break;
58787 case 'r': // GENERAL_REGS
58788 case 'l': // INDEX_REGS
58789 if (VT == MVT::i8 || VT == MVT::i1)
58790 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58791 ? &X86::GR8RegClass
58792 : &X86::GR8_NOREX2RegClass);
58793 if (VT == MVT::i16)
58794 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58795 ? &X86::GR16RegClass
58796 : &X86::GR16_NOREX2RegClass);
58797 if (VT == MVT::i32 || VT == MVT::f32 ||
58798 (!VT.isVector() && !Subtarget.is64Bit()))
58799 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58800 ? &X86::GR32RegClass
58801 : &X86::GR32_NOREX2RegClass);
58802 if (VT != MVT::f80 && !VT.isVector())
58803 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58804 ? &X86::GR64RegClass
58805 : &X86::GR64_NOREX2RegClass);
58806 break;
58807 case 'R': // LEGACY_REGS
58808 if (VT == MVT::i8 || VT == MVT::i1)
58809 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58810 if (VT == MVT::i16)
58811 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58812 if (VT == MVT::i32 || VT == MVT::f32 ||
58813 (!VT.isVector() && !Subtarget.is64Bit()))
58814 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58815 if (VT != MVT::f80 && !VT.isVector())
58816 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58817 break;
58818 case 'f': // FP Stack registers.
58819 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58820 // value to the correct fpstack register class.
58821 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58822 return std::make_pair(0U, &X86::RFP32RegClass);
58823 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58824 return std::make_pair(0U, &X86::RFP64RegClass);
58825 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58826 return std::make_pair(0U, &X86::RFP80RegClass);
58827 break;
58828 case 'y': // MMX_REGS if MMX allowed.
58829 if (!Subtarget.hasMMX()) break;
58830 return std::make_pair(0U, &X86::VR64RegClass);
58831 case 'v':
58832 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58833 if (!Subtarget.hasSSE1()) break;
58834 bool VConstraint = (Constraint[0] == 'v');
58835
58836 switch (VT.SimpleTy) {
58837 default: break;
58838 // Scalar SSE types.
58839 case MVT::f16:
58840 if (VConstraint && Subtarget.hasFP16())
58841 return std::make_pair(0U, &X86::FR16XRegClass);
58842 break;
58843 case MVT::f32:
58844 case MVT::i32:
58845 if (VConstraint && Subtarget.hasVLX())
58846 return std::make_pair(0U, &X86::FR32XRegClass);
58847 return std::make_pair(0U, &X86::FR32RegClass);
58848 case MVT::f64:
58849 case MVT::i64:
58850 if (VConstraint && Subtarget.hasVLX())
58851 return std::make_pair(0U, &X86::FR64XRegClass);
58852 return std::make_pair(0U, &X86::FR64RegClass);
58853 case MVT::i128:
58854 if (Subtarget.is64Bit()) {
58855 if (VConstraint && Subtarget.hasVLX())
58856 return std::make_pair(0U, &X86::VR128XRegClass);
58857 return std::make_pair(0U, &X86::VR128RegClass);
58858 }
58859 break;
58860 // Vector types and fp128.
58861 case MVT::v8f16:
58862 if (!Subtarget.hasFP16())
58863 break;
58864 if (VConstraint)
58865 return std::make_pair(0U, &X86::VR128XRegClass);
58866 return std::make_pair(0U, &X86::VR128RegClass);
58867 case MVT::v8bf16:
58868 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58869 break;
58870 if (VConstraint)
58871 return std::make_pair(0U, &X86::VR128XRegClass);
58872 return std::make_pair(0U, &X86::VR128RegClass);
58873 case MVT::f128:
58874 case MVT::v16i8:
58875 case MVT::v8i16:
58876 case MVT::v4i32:
58877 case MVT::v2i64:
58878 case MVT::v4f32:
58879 case MVT::v2f64:
58880 if (VConstraint && Subtarget.hasVLX())
58881 return std::make_pair(0U, &X86::VR128XRegClass);
58882 return std::make_pair(0U, &X86::VR128RegClass);
58883 // AVX types.
58884 case MVT::v16f16:
58885 if (!Subtarget.hasFP16())
58886 break;
58887 if (VConstraint)
58888 return std::make_pair(0U, &X86::VR256XRegClass);
58889 return std::make_pair(0U, &X86::VR256RegClass);
58890 case MVT::v16bf16:
58891 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58892 break;
58893 if (VConstraint)
58894 return std::make_pair(0U, &X86::VR256XRegClass);
58895 return std::make_pair(0U, &X86::VR256RegClass);
58896 case MVT::v32i8:
58897 case MVT::v16i16:
58898 case MVT::v8i32:
58899 case MVT::v4i64:
58900 case MVT::v8f32:
58901 case MVT::v4f64:
58902 if (VConstraint && Subtarget.hasVLX())
58903 return std::make_pair(0U, &X86::VR256XRegClass);
58904 if (Subtarget.hasAVX())
58905 return std::make_pair(0U, &X86::VR256RegClass);
58906 break;
58907 case MVT::v32f16:
58908 if (!Subtarget.hasFP16())
58909 break;
58910 if (VConstraint)
58911 return std::make_pair(0U, &X86::VR512RegClass);
58912 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58913 case MVT::v32bf16:
58914 if (!Subtarget.hasBF16())
58915 break;
58916 if (VConstraint)
58917 return std::make_pair(0U, &X86::VR512RegClass);
58918 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58919 case MVT::v64i8:
58920 case MVT::v32i16:
58921 case MVT::v8f64:
58922 case MVT::v16f32:
58923 case MVT::v16i32:
58924 case MVT::v8i64:
58925 if (!Subtarget.hasAVX512()) break;
58926 if (VConstraint)
58927 return std::make_pair(0U, &X86::VR512RegClass);
58928 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58929 }
58930 break;
58931 }
58932 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58933 switch (Constraint[1]) {
58934 default:
58935 break;
58936 case 'i':
58937 case 't':
58938 case '2':
58939 return getRegForInlineAsmConstraint(TRI, "x", VT);
58940 case 'm':
58941 if (!Subtarget.hasMMX()) break;
58942 return std::make_pair(0U, &X86::VR64RegClass);
58943 case 'z':
58944 if (!Subtarget.hasSSE1()) break;
58945 switch (VT.SimpleTy) {
58946 default: break;
58947 // Scalar SSE types.
58948 case MVT::f16:
58949 if (!Subtarget.hasFP16())
58950 break;
58951 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58952 case MVT::f32:
58953 case MVT::i32:
58954 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58955 case MVT::f64:
58956 case MVT::i64:
58957 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58958 case MVT::v8f16:
58959 if (!Subtarget.hasFP16())
58960 break;
58961 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58962 case MVT::v8bf16:
58963 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58964 break;
58965 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58966 case MVT::f128:
58967 case MVT::v16i8:
58968 case MVT::v8i16:
58969 case MVT::v4i32:
58970 case MVT::v2i64:
58971 case MVT::v4f32:
58972 case MVT::v2f64:
58973 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58974 // AVX types.
58975 case MVT::v16f16:
58976 if (!Subtarget.hasFP16())
58977 break;
58978 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58979 case MVT::v16bf16:
58980 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58981 break;
58982 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58983 case MVT::v32i8:
58984 case MVT::v16i16:
58985 case MVT::v8i32:
58986 case MVT::v4i64:
58987 case MVT::v8f32:
58988 case MVT::v4f64:
58989 if (Subtarget.hasAVX())
58990 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58991 break;
58992 case MVT::v32f16:
58993 if (!Subtarget.hasFP16())
58994 break;
58995 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58996 case MVT::v32bf16:
58997 if (!Subtarget.hasBF16())
58998 break;
58999 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
59000 case MVT::v64i8:
59001 case MVT::v32i16:
59002 case MVT::v8f64:
59003 case MVT::v16f32:
59004 case MVT::v16i32:
59005 case MVT::v8i64:
59006 if (Subtarget.hasAVX512())
59007 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
59008 break;
59009 }
59010 break;
59011 case 'k':
59012 // This register class doesn't allocate k0 for masked vector operation.
59013 if (Subtarget.hasAVX512()) {
59014 if (VT == MVT::v1i1 || VT == MVT::i1)
59015 return std::make_pair(0U, &X86::VK1WMRegClass);
59016 if (VT == MVT::v8i1 || VT == MVT::i8)
59017 return std::make_pair(0U, &X86::VK8WMRegClass);
59018 if (VT == MVT::v16i1 || VT == MVT::i16)
59019 return std::make_pair(0U, &X86::VK16WMRegClass);
59020 }
59021 if (Subtarget.hasBWI()) {
59022 if (VT == MVT::v32i1 || VT == MVT::i32)
59023 return std::make_pair(0U, &X86::VK32WMRegClass);
59024 if (VT == MVT::v64i1 || VT == MVT::i64)
59025 return std::make_pair(0U, &X86::VK64WMRegClass);
59026 }
59027 break;
59028 }
59029 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
59030 switch (Constraint[1]) {
59031 default:
59032 break;
59033 case 'r':
59034 if (VT == MVT::i8 || VT == MVT::i1)
59035 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
59036 if (VT == MVT::i16)
59037 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
59038 if (VT == MVT::i32 || VT == MVT::f32)
59039 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
59040 if (VT != MVT::f80 && !VT.isVector())
59041 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
59042 break;
59043 case 'R':
59044 if (VT == MVT::i8 || VT == MVT::i1)
59045 return std::make_pair(0U, &X86::GR8RegClass);
59046 if (VT == MVT::i16)
59047 return std::make_pair(0U, &X86::GR16RegClass);
59048 if (VT == MVT::i32 || VT == MVT::f32)
59049 return std::make_pair(0U, &X86::GR32RegClass);
59050 if (VT != MVT::f80 && !VT.isVector())
59051 return std::make_pair(0U, &X86::GR64RegClass);
59052 break;
59053 }
59054 }
59055
59056 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
59057 return std::make_pair(0U, &X86::GR32RegClass);
59058
59059 // Use the default implementation in TargetLowering to convert the register
59060 // constraint into a member of a register class.
59061 std::pair<Register, const TargetRegisterClass*> Res;
59062 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
59063
59064 // Not found as a standard register?
59065 if (!Res.second) {
59066 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
59067 // to/from f80.
59068 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
59069 // Map st(0) -> st(7) -> ST0
59070 if (Constraint.size() == 7 && Constraint[0] == '{' &&
59071 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
59072 Constraint[3] == '(' &&
59073 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
59074 Constraint[5] == ')' && Constraint[6] == '}') {
59075 // st(7) is not allocatable and thus not a member of RFP80. Return
59076 // singleton class in cases where we have a reference to it.
59077 if (Constraint[4] == '7')
59078 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
59079 return std::make_pair(X86::FP0 + Constraint[4] - '0',
59080 &X86::RFP80RegClass);
59081 }
59082
59083 // GCC allows "st(0)" to be called just plain "st".
59084 if (StringRef("{st}").equals_insensitive(Constraint))
59085 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
59086 }
59087
59088 // flags -> EFLAGS
59089 if (StringRef("{flags}").equals_insensitive(Constraint))
59090 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
59091
59092 // dirflag -> DF
59093 // Only allow for clobber.
59094 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
59095 VT == MVT::Other)
59096 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
59097
59098 // fpsr -> FPSW
59099 // Only allow for clobber.
59100 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
59101 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
59102
59103 return Res;
59104 }
59105
59106 // Make sure it isn't a register that requires 64-bit mode.
59107 if (!Subtarget.is64Bit() &&
59108 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
59109 TRI->getEncodingValue(Res.first) >= 8) {
59110 // Register requires REX prefix, but we're in 32-bit mode.
59111 return std::make_pair(0, nullptr);
59112 }
59113
59114 // Make sure it isn't a register that requires AVX512.
59115 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
59116 TRI->getEncodingValue(Res.first) & 0x10) {
59117 // Register requires EVEX prefix.
59118 return std::make_pair(0, nullptr);
59119 }
59120
59121 // Otherwise, check to see if this is a register class of the wrong value
59122 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
59123 // turn into {ax},{dx}.
59124 // MVT::Other is used to specify clobber names.
59125 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
59126 return Res; // Correct type already, nothing to do.
59127
59128 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
59129 // return "eax". This should even work for things like getting 64bit integer
59130 // registers when given an f64 type.
59131 const TargetRegisterClass *Class = Res.second;
59132 // The generic code will match the first register class that contains the
59133 // given register. Thus, based on the ordering of the tablegened file,
59134 // the "plain" GR classes might not come first.
59135 // Therefore, use a helper method.
59136 if (isGRClass(*Class)) {
59137 unsigned Size = VT.getSizeInBits();
59138 if (Size == 1) Size = 8;
59139 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
59140 return std::make_pair(0, nullptr);
59141 Register DestReg = getX86SubSuperRegister(Res.first, Size);
59142 if (DestReg.isValid()) {
59143 bool is64Bit = Subtarget.is64Bit();
59144 const TargetRegisterClass *RC =
59145 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
59146 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
59147 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
59148 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
59149 if (Size == 64 && !is64Bit) {
59150 // Model GCC's behavior here and select a fixed pair of 32-bit
59151 // registers.
59152 switch (DestReg) {
59153 case X86::RAX:
59154 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59155 case X86::RDX:
59156 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
59157 case X86::RCX:
59158 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
59159 case X86::RBX:
59160 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
59161 case X86::RSI:
59162 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
59163 case X86::RDI:
59164 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
59165 case X86::RBP:
59166 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
59167 default:
59168 return std::make_pair(0, nullptr);
59169 }
59170 }
59171 if (RC && RC->contains(DestReg))
59172 return std::make_pair(DestReg, RC);
59173 return Res;
59174 }
59175 // No register found/type mismatch.
59176 return std::make_pair(0, nullptr);
59177 } else if (isFRClass(*Class)) {
59178 // Handle references to XMM physical registers that got mapped into the
59179 // wrong class. This can happen with constraints like {xmm0} where the
59180 // target independent register mapper will just pick the first match it can
59181 // find, ignoring the required type.
59182
59183 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
59184 if (VT == MVT::f16)
59185 Res.second = &X86::FR16XRegClass;
59186 else if (VT == MVT::f32 || VT == MVT::i32)
59187 Res.second = &X86::FR32XRegClass;
59188 else if (VT == MVT::f64 || VT == MVT::i64)
59189 Res.second = &X86::FR64XRegClass;
59190 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59191 Res.second = &X86::VR128XRegClass;
59192 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59193 Res.second = &X86::VR256XRegClass;
59194 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59195 Res.second = &X86::VR512RegClass;
59196 else {
59197 // Type mismatch and not a clobber: Return an error;
59198 Res.first = 0;
59199 Res.second = nullptr;
59200 }
59201 } else if (isVKClass(*Class)) {
59202 if (VT == MVT::v1i1 || VT == MVT::i1)
59203 Res.second = &X86::VK1RegClass;
59204 else if (VT == MVT::v8i1 || VT == MVT::i8)
59205 Res.second = &X86::VK8RegClass;
59206 else if (VT == MVT::v16i1 || VT == MVT::i16)
59207 Res.second = &X86::VK16RegClass;
59208 else if (VT == MVT::v32i1 || VT == MVT::i32)
59209 Res.second = &X86::VK32RegClass;
59210 else if (VT == MVT::v64i1 || VT == MVT::i64)
59211 Res.second = &X86::VK64RegClass;
59212 else {
59213 // Type mismatch and not a clobber: Return an error;
59214 Res.first = 0;
59215 Res.second = nullptr;
59216 }
59217 }
59218
59219 return Res;
59220}
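// Illustrative note (not from the source file): Clang attaches the clobber
// string "~{dirflag},~{fpsr},~{flags}" to every x86 asm statement; the
// fallback path above maps those names to DF, FPSW and EFLAGS (clobber-only,
// VT == MVT::Other), "{st}"/"{st(N)}" to the x87 RFP80 registers, and fixes up
// explicit registers such as "{xmm0}" whose default class has the wrong type.
// Example IR:
//   call i32 asm "bsrl $1, $0", "=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %x)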
59221
59222bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59223 // Integer division on x86 is expensive. However, when aggressively optimizing
59224 // for code size, we prefer to use a div instruction, as it is usually smaller
59225 // than the alternative sequence.
59226 // The exception to this is vector division. Since x86 doesn't have vector
59227 // integer division, leaving the division as-is is a loss even in terms of
59228 // size, because it will have to be scalarized, while the alternative code
59229 // sequence can be performed in vector form.
59230 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59231 return OptSize && !VT.isVector();
59232}
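// Illustrative sketch (not from the source file; assumes Clang's minsize
// attribute): under minsize a scalar 'x / 10' is kept as a single divl rather
// than the longer multiply-by-reciprocal expansion, while vector divides are
// still scalarized because x86 has no vector integer division.
#if 0
__attribute__((minsize)) unsigned div10(unsigned x) { return x / 10; }
#endif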
59233
59234void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59235 if (!Subtarget.is64Bit())
59236 return;
59237
59238 // Update IsSplitCSR in X86MachineFunctionInfo.
59239 X86MachineFunctionInfo *AFI =
59240 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59241 AFI->setIsSplitCSR(true);
59242}
59243
59244void X86TargetLowering::insertCopiesSplitCSR(
59245 MachineBasicBlock *Entry,
59246 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59247 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59248 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59249 if (!IStart)
59250 return;
59251
59252 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59253 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59254 MachineBasicBlock::iterator MBBI = Entry->begin();
59255 for (const MCPhysReg *I = IStart; *I; ++I) {
59256 const TargetRegisterClass *RC = nullptr;
59257 if (X86::GR64RegClass.contains(*I))
59258 RC = &X86::GR64RegClass;
59259 else
59260 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59261
59262 Register NewVR = MRI->createVirtualRegister(RC);
59263 // Create copy from CSR to a virtual register.
59264 // FIXME: this currently does not emit CFI pseudo-instructions, it works
59265 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59266 // nounwind. If we want to generalize this later, we may need to emit
59267 // CFI pseudo-instructions.
59268 assert(
59269 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59270 "Function should be nounwind in insertCopiesSplitCSR!");
59271 Entry->addLiveIn(*I);
59272 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
59273 .addReg(*I);
59274
59275 // Insert the copy-back instructions right before the terminator.
59276 for (auto *Exit : Exits)
59277 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
59278 TII->get(TargetOpcode::COPY), *I)
59279 .addReg(NewVR);
59280 }
59281}
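// Illustrative note (not from the source file): these CSR-via-copy virtual
// registers are used for conventions that split callee-saved restores, such as
// the C++ fast TLS convention, e.g. IR of the form
//   define cxx_fast_tlscc ptr @_ZTW5guard() { ... }
// (@_ZTW5guard is only an example name for a TLS wrapper function.)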
59282
59283bool X86TargetLowering::supportSwiftError() const {
59284 return Subtarget.is64Bit();
59285}
59286
59287MachineInstr *
59288X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59289 MachineBasicBlock::iterator &MBBI,
59290 const TargetInstrInfo *TII) const {
59291 assert(MBBI->isCall() && MBBI->getCFIType() &&
59292 "Invalid call instruction for a KCFI check");
59293
59294 MachineFunction &MF = *MBB.getParent();
59295 // If the call target is a memory operand, unfold it and use R11 for the
59296 // call, so KCFI_CHECK won't have to recompute the address.
59297 switch (MBBI->getOpcode()) {
59298 case X86::CALL64m:
59299 case X86::CALL64m_NT:
59300 case X86::TAILJMPm64:
59301 case X86::TAILJMPm64_REX: {
59302 MachineBasicBlock::instr_iterator OrigCall = MBBI->getIterator();
59303 SmallVector<MachineInstr *, 2> NewMIs;
59304 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59305 /*UnfoldStore=*/false, NewMIs))
59306 report_fatal_error("Failed to unfold memory operand for a KCFI check");
59307 for (auto *NewMI : NewMIs)
59308 MBBI = MBB.insert(OrigCall, NewMI);
59309 assert(MBBI->isCall() &&
59310 "Unexpected instruction after memory operand unfolding");
59311 if (OrigCall->shouldUpdateCallSiteInfo())
59312 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59313 MBBI->setCFIType(MF, OrigCall->getCFIType());
59314 OrigCall->eraseFromParent();
59315 break;
59316 }
59317 default:
59318 break;
59319 }
59320
59321 MachineOperand &Target = MBBI->getOperand(0);
59322 Register TargetReg;
59323 switch (MBBI->getOpcode()) {
59324 case X86::CALL64r:
59325 case X86::CALL64r_NT:
59326 case X86::TAILJMPr64:
59327 case X86::TAILJMPr64_REX:
59328 assert(Target.isReg() && "Unexpected target operand for an indirect call");
59329 Target.setIsRenamable(false);
59330 TargetReg = Target.getReg();
59331 break;
59332 case X86::CALL64pcrel32:
59333 case X86::TAILJMPd64:
59334 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59335 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59336 // 64-bit indirect thunk calls.
59337 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
59338 "Unexpected register for an indirect thunk call");
59339 TargetReg = X86::R11;
59340 break;
59341 default:
59342 llvm_unreachable("Unexpected CFI call opcode");
59343 break;
59344 }
59345
59346 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
59347 .addReg(TargetReg)
59348 .addImm(MBBI->getCFIType())
59349 .getInstr();
59350}
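// Illustrative note (not from the source file): the check above fires for
// indirect calls that carry a "kcfi" operand bundle (emitted under
// -fsanitize=kcfi), e.g.
//   call void %fp() [ "kcfi"(i32 12345678) ]
// The bundle's type hash becomes the immediate of the KCFI_CHECK pseudo.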
59351
59352/// Returns true if stack probing through a function call is requested.
59353bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59354 return !getStackProbeSymbolName(MF).empty();
59355}
59356
59357/// Returns true if stack probing through inline assembly is requested.
59358bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59359
59360 // No inline stack probe for Windows, they have their own mechanism.
59361 if (Subtarget.isOSWindows() ||
59362 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59363 return false;
59364
59365 // If the function specifically requests inline stack probes, emit them.
59366 if (MF.getFunction().hasFnAttribute("probe-stack"))
59367 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59368 "inline-asm";
59369
59370 return false;
59371}
59372
59373/// Returns the name of the symbol used to emit stack probes or the empty
59374/// string if not applicable.
59375StringRef
59376X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
59377 // Inline Stack probes disable stack probe call
59378 if (hasInlineStackProbe(MF))
59379 return "";
59380
59381 // If the function specifically requests stack probes, emit them.
59382 if (MF.getFunction().hasFnAttribute("probe-stack"))
59383 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59384
59385 // Generally, if we aren't on Windows, the platform ABI does not include
59386 // support for stack probes, so don't emit them.
59387 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59388 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59389 return "";
59390
59391 // We need a stack probe to conform to the Windows ABI. Choose the right
59392 // symbol.
59393 if (Subtarget.is64Bit())
59394 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59395 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59396}
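// Illustrative note (not from the source file): probe selection is driven by
// IR function attributes, e.g.
//   attributes #0 = { "probe-stack"="inline-asm" }   ; expand probes inline
//   attributes #0 = { "probe-stack"="__my_probe" }   ; call this symbol instead
// With neither attribute, Windows targets fall back to __chkstk (MSVC 64-bit),
// ___chkstk_ms / _alloca (CygMing) or _chkstk (MSVC 32-bit) as chosen above.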
59397
59398unsigned
59399X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59400 // The default stack probe size is 4096 if the function has no stackprobesize
59401 // attribute.
59402 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59403 4096);
59404}
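// Illustrative note (not from the source file): the probe interval can be
// raised per function via the same attribute group, e.g.
//   attributes #0 = { "probe-stack"="inline-asm" "stack-probe-size"="8192" }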
59405
59406Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59407 if (ML && ML->isInnermost() &&
59408 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59409 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59410 return TargetLowering::getPrefLoopAlignment(ML);
59411}
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a broadcast of a single (truncated) integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic=false)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
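A sketch of the well-known "magic constant" expansion for u32 to floating point (whether this exact sequence is emitted depends on the subtarget): OR the integer into the mantissa of 2^52 and subtract 2^52, leaving exactly the integer value.

    #include <cstdint>
    #include <cstring>

    static double u32ToDouble(uint32_t X) {
      uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X in the low mantissa bits
      double Magic;
      std::memcpy(&Magic, &Bits, sizeof(Magic)); // bit-cast to double
      return Magic - 4503599627370496.0;         // subtract 2^52
    }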
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
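A hypothetical sketch of the test, assuming undef mask entries are negative sentinels: the lower half is undefined iff no element in it selects a real source.

    #include <vector>

    static bool lowerHalfIsUndef(const std::vector<int> &Mask) {
      for (size_t I = 0, Half = Mask.size() / 2; I != Half; ++I)
        if (Mask[I] >= 0)
          return false; // some lower-half element reads a defined source
      return true;
    }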
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with a chain that return their value in the EDX:EAX register pair.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
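The byte-sum step commonly maps to PSADBW against a zero vector; a scalar model of that reduction (illustrative only): the sum of absolute differences versus zero is just the sum of the bytes, so each group of eight bytes collapses into a small integer in a 64-bit lane.

    #include <cstdint>

    static uint64_t sumEightBytes(const uint8_t Bytes[8]) {
      uint64_t Sum = 0;
      for (int I = 0; I != 8; ++I)
        Sum += Bytes[I]; // |b - 0| == b
      return Sum;
    }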
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
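A minimal, branchy scalar sketch of the arithmetic behind the SHL case, assuming a 64-bit value held as two 32-bit halves Lo/Hi (the backend avoids the branches; this only shows the value computation).

    #include <cstdint>

    static void shlParts(uint32_t &Lo, uint32_t &Hi, unsigned Amt) {
      Amt &= 63;
      if (Amt == 0)
        return;
      if (Amt >= 32) {
        Hi = Lo << (Amt - 32); // the low half shifts entirely into the high half
        Lo = 0;
      } else {
        Hi = (Hi << Amt) | (Lo >> (32 - Amt));
        Lo <<= Amt;
      }
    }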
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
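A hypothetical scalar illustration of why pre-truncating is legal for wrapping arithmetic: truncation commutes with add/sub/mul modulo 2^n, so narrowing the inputs first yields the same bits.

    #include <cstdint>

    static uint16_t truncOfAdd(uint32_t X, uint32_t Y) {
      return uint16_t(X + Y);                     // trunc(add(x, y))
    }
    static uint16_t addOfTrunc(uint32_t X, uint32_t Y) {
      return uint16_t(uint16_t(X) + uint16_t(Y)); // add(trunc(x), trunc(y)) -- same result
    }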
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG)
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into two half-sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
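As a rough illustration of the encoding (a hypothetical standalone helper, not the LLVM routine): each destination lane occupies two bits of the PSHUFD-style imm8.

    #include <cassert>
    #include <cstdint>

    static uint8_t packV4ShuffleImm(const int Mask[4]) {
      uint8_t Imm = 0;
      for (int I = 0; I != 4; ++I) {
        assert(Mask[I] >= 0 && Mask[I] < 4 && "mask must be defined and in range");
        Imm |= uint8_t(Mask[I] << (2 * I)); // destination lane I reads source Mask[I]
      }
      return Imm; // e.g. the reversal mask <3,2,1,0> packs to 0x1B
    }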
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
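A hypothetical scalar model of the pattern being matched (not the matcher itself), for an i32 to i16 truncation: clamp into the signed range of the destination type, then truncate.

    #include <algorithm>
    #include <cstdint>

    static int16_t truncateWithSSat(int32_t X) {
      int32_t Clamped = std::min<int32_t>(std::max<int32_t>(X, INT16_MIN), INT16_MAX);
      return int16_t(Clamped); // (truncate (smin (smax x, -32768), 32767))
    }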
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit a truncating store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static bool isUndefOrEqual(int Val, int CmpVal)
Return true if Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, bool HasVariableMask, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
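A hypothetical helper showing the interleave pattern behind unpackh/unpackl masks: for four elements the unpack-high mask is <2,6,3,7> (A2 B2 A3 B3), and the unpack-low counterpart, starting the loop at 0, is <0,4,1,5>.

    #include <vector>

    static std::vector<int> unpackHighMask(int NumElts) {
      std::vector<int> Mask;
      for (int I = NumElts / 2; I != NumElts; ++I) {
        Mask.push_back(I);           // element I of the first source
        Mask.push_back(I + NumElts); // element I of the second source
      }
      return Mask;
    }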
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low-bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
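A hypothetical scalar illustration of why hoisting the extension is sound when the narrow add carries a no-wrap flag: with no signed wrap, widening before or after the add produces the same value.

    #include <cstdint>

    static int32_t addThenSext(int8_t X) {
      return int32_t(int8_t(X + 1)); // sext(add_nsw(x, 1)), assuming the add does not wrap
    }
    static int32_t sextThenAdd(int8_t X) {
      return int32_t(X) + 1;         // add(sext(x), 1) -- equal under the nsw assumption
    }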
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
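std::round follows the same ties-away-from-zero rule, which makes the semantics easy to spot-check.

    #include <cassert>
    #include <cmath>

    static void froundSemantics() {
      assert(std::round(2.5) == 3.0);   // ties round away from zero
      assert(std::round(-2.5) == -3.0);
      assert(std::round(2.4) == 2.0);
    }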
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
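The unsigned counterpart of the model above (hypothetical scalar sketch): clamp to the destination's unsigned maximum, then truncate.

    #include <algorithm>
    #include <cstdint>

    static uint16_t truncateWithUSat(uint32_t X) {
      return uint16_t(std::min<uint32_t>(X, 0xFFFFu)); // (truncate (umin x, 65535))
    }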
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256-bit integer VSETCC into two new 128-bit ones and then concatenate the result back.
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit a masked truncating store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shuffled.
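The widening idea can be illustrated outside the DAG: adjacent mask elements that address a consecutive even/odd pair (or are undef) collapse into one element of a mask with twice the element width. A minimal standalone sketch, using -1 as the undef sentinel and ignoring the zero sentinel the real helper also handles:

  #include <cassert>
  #include <vector>

  // Widen a shuffle mask by combining adjacent pairs; returns false if some
  // pair cannot be expressed as a single wider element.
  static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
    Widened.clear();
    for (size_t I = 0; I + 1 < Mask.size(); I += 2) {
      int Lo = Mask[I], Hi = Mask[I + 1];
      if (Lo == -1 && Hi == -1)
        Widened.push_back(-1);                       // both halves undef
      else if (Lo != -1 && Hi == Lo + 1 && (Lo % 2) == 0)
        Widened.push_back(Lo / 2);                   // consecutive even/odd pair
      else if (Lo == -1 && Hi != -1 && (Hi % 2) == 1)
        Widened.push_back(Hi / 2);                   // undef low half
      else if (Hi == -1 && Lo != -1 && (Lo % 2) == 0)
        Widened.push_back(Lo / 2);                   // undef high half
      else
        return false;
    }
    return true;
  }

  int main() {
    std::vector<int> W;
    assert(widenMask({0, 1, 6, 7}, W) && W == std::vector<int>({0, 3}));
    assert(!widenMask({0, 2, 4, 6}, W));             // strided mask cannot widen
  }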
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
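The fold rests on a simple scalar identity: XOR-ing a comparison result with 1 inverts the comparison. A tiny standalone check of that identity (plain C++, not SelectionDAG code):

  #include <cassert>

  int main() {
    for (int A = -2; A <= 2; ++A)
      for (int B = -2; B <= 2; ++B)
        assert(((A < B) ^ 1) == (A >= B));   // xor with 1 == inverted predicate
  }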
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp).
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expansion.
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
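A standalone sketch of the same predicate on a plain mask vector, with -1 as the undef sentinel; the real helper additionally participates in the zero-sentinel variants listed above:

  #include <cassert>
  #include <vector>

  // Every element in [Pos, Pos+Size) must be undef (-1) or equal Low + Step*i.
  static bool isSeqOrUndef(const std::vector<int> &Mask, unsigned Pos,
                           unsigned Size, int Low, int Step = 1) {
    for (unsigned I = 0; I != Size; ++I, Low += Step) {
      int M = Mask[Pos + I];
      if (M != -1 && M != Low)
        return false;
    }
    return true;
  }

  int main() {
    assert(isSeqOrUndef({4, -1, 6, 7}, 0, 4, 4));    // 4,5,6,7 with one undef
    assert(!isSeqOrUndef({4, 6, -1, 7}, 0, 4, 4));   // 6 breaks the sequence
    assert(isSeqOrUndef({0, 2, 4, 6}, 0, 4, 0, 2));  // Step = 2
  }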
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
if(isa< SExtInst >(LHS)) std auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5317
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:5342
void clearSign()
Definition: APFloat.h:1209
opStatus next(bool nextDown)
Definition: APFloat.h:1165
void changeSign()
Definition: APFloat.h:1208
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:988
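A brief APFloat usage sketch covering a few of the members above; it assumes an LLVM build to compile and link against:

  #include "llvm/ADT/APFloat.h"
  #include <cassert>

  int main() {
    using namespace llvm;
    APFloat Z = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
    assert(Z.isNegative() && Z.isZero());
    Z.changeSign();                               // -0.0 -> +0.0
    assert(!Z.isNegative());
    APFloat Ones = APFloat::getAllOnesValue(APFloat::IEEEsingle());
    assert(Ones.isNaN());                         // all-ones bit pattern is a NaN
  }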
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1387
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:429
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:209
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:403
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1372
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1629
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1366
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1472
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:186
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1310
APInt abs() const
Get the absolute value.
Definition: APInt.h:1753
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:351
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:238
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:360
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:446
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1320
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1448
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1091
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:189
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:196
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:309
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1229
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1059
int32_t exactLogBase2() const
Definition: APInt.h:1741
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1377
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:814
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1598
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:415
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1587
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1557
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:199
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1491
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1414
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1574
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1397
unsigned logBase2() const
Definition: APInt.h:1719
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1299
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:451
bool isMask(unsigned numBits) const
Definition: APInt.h:468
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:385
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:314
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1130
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1347
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:853
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1237
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:420
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:286
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:321
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:276
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:180
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1369
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:412
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:369
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:266
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:219
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:838
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:831
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1615
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1201
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:379
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
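A short sketch exercising a handful of the APInt members listed above (assumes an LLVM build):

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    using namespace llvm;
    APInt V = APInt::getLowBitsSet(32, 8);        // 0x000000FF
    assert(V.isMask(8) && V.popcount() == 8);
    V.setBit(31);                                 // set the sign bit
    assert(V.isSignBitSet() && V.isNegative());
    APInt P = APInt::getOneBitSet(32, 4);         // 0x00000010
    assert(P.isPowerOf2() && P.logBase2() == 4 && P.countr_zero() == 4);
    assert(APInt::getAllOnes(16).isAllOnes());
  }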
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
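ArrayRef is a non-owning view, so slice() and drop_back() return sub-views without copying. A minimal usage sketch (assumes LLVM headers):

  #include "llvm/ADT/ArrayRef.h"
  #include <cassert>

  int main() {
    int Storage[] = {1, 2, 3, 4, 5};
    llvm::ArrayRef<int> A(Storage);               // view over Storage
    assert(A.size() == 5 && !A.empty());
    llvm::ArrayRef<int> Mid = A.slice(1, 3);      // view over {2, 3, 4}
    assert(Mid.front() == 2 && Mid.back() == 4);
    assert(A.drop_back(2).size() == 3);           // view over {1, 2, 3}
  }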
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:644
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:809
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ FSub
*p = old - v
Definition: Instructions.h:736
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:748
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:752
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
Value * getPointerOperand()
Definition: Instructions.h:852
BinOp getOperation() const
Definition: Instructions.h:787
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:843
Value * getValOperand()
Definition: Instructions.h:856
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:829
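A small sketch over the AtomicRMWInst interface above: classify an atomicrmw instruction by its update rule. The helper name and the classification are illustrative only, not code from this file (assumes LLVM headers):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/IR/Instructions.h"

  static llvm::StringRef describeRMW(const llvm::AtomicRMWInst &AI) {
    switch (AI.getOperation()) {
    case llvm::AtomicRMWInst::Add:  return "*p = old + v";
    case llvm::AtomicRMWInst::Sub:  return "*p = old - v";
    case llvm::AtomicRMWInst::And:  return "*p = old & v";
    case llvm::AtomicRMWInst::Or:   return "*p = old | v";
    case llvm::AtomicRMWInst::Xor:  return "*p = old ^ v";
    case llvm::AtomicRMWInst::Nand: return "*p = ~(old & v)";
    default:                        return "other atomicrmw update";
    }
  }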
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:391
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:890
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1458
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2954
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
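A minimal DenseMap usage sketch matching the members above; note that insert() does not overwrite an existing key (assumes LLVM headers):

  #include "llvm/ADT/DenseMap.h"
  #include <cassert>

  int main() {
    llvm::DenseMap<int, int> M;
    auto Res = M.insert({1, 10});                 // pair<iterator, bool>
    assert(Res.second && M.contains(1));
    M.insert({1, 99});                            // duplicate key: value kept
    assert(M.find(1)->second == 10);
    assert(M.size() == 1 && !M.empty());
  }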
Tagged union holding either a T or a Error.
Definition: Error.h:481
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:745
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:757
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:868
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1963
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:719
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:567
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:399
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:271
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:174
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:793
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:242
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:247
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
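A short MVT sketch built from the members above; the header path assumes LLVM 17 or later (llvm/CodeGen/MachineValueType.h):

  #include "llvm/CodeGen/MachineValueType.h"
  #include <cassert>

  int main() {
    using namespace llvm;
    MVT V = MVT::getVectorVT(MVT::i32, 4);        // v4i32
    assert(V == MVT::v4i32 && V.is128BitVector());
    assert(V.getVectorNumElements() == 4 && V.getScalarSizeInBits() == 32);
    assert(V.changeVectorElementType(MVT::f32) == MVT::v4f32);
    assert(MVT::getIntegerVT(64) == MVT::i64);
  }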
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
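A hedged sketch of the MachineInstrBuilder chaining pattern above, written as a free-standing helper so nothing target-specific is assumed; the function itself is hypothetical, not from this file:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  #include "llvm/IR/DebugLoc.h"

  // Emit `Dst = <Opcode> LHS, RHS` before InsertPt using the builder API.
  static void emitBinOp(llvm::MachineBasicBlock &MBB,
                        llvm::MachineBasicBlock::iterator InsertPt,
                        const llvm::DebugLoc &DL,
                        const llvm::TargetInstrInfo &TII, unsigned Opcode,
                        llvm::Register Dst, llvm::Register LHS,
                        llvm::Register RHS) {
    llvm::BuildMI(MBB, InsertPt, DL, TII.get(Opcode), Dst)
        .addReg(LHS)                              // first source register
        .addReg(RHS);                             // second source register
  }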
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:333
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this node is an UNDEF node.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUses uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:364
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:936
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:734
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:966
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
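The SetCC helper above is typically paired with getSetCCResultType when lowering code builds comparisons by hand. A minimal sketch (a hypothetical helper, not code from this file), assuming integer operands:
  static SDValue buildEqualityCheck(const TargetLowering &TLI, SelectionDAG &DAG,
                                    const SDLoc &DL, SDValue LHS, SDValue RHS) {
    // Ask the target what type a SETCC result should have for this operand
    // type, then build the comparison from a plain ISD::CondCode.
    EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                       LHS.getValueType());
    return DAG.getSetCC(DL, CmpVT, LHS, RHS, ISD::SETEQ);
  }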
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
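As a rough illustration of getNOT (hypothetical snippet, assuming DAG, DL and an integer-typed SDValue V are already in scope), the following two forms build equivalent nodes:
  SDValue NotV = DAG.getNOT(DL, V, V.getValueType());
  // Spelled out by hand: XOR against an all-ones constant of the same type.
  SDValue AllOnes = DAG.getAllOnesConstant(DL, V.getValueType());
  SDValue NotVByHand = DAG.getNode(ISD::XOR, DL, V.getValueType(), V, AllOnes);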
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:451
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:744
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:840
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:673
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
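GetSplitDestVTs and SplitVector are normally used together: compute the half types first, then split. A minimal sketch (hypothetical helper, not part of this file):
  static SDValue splitAndRejoin(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(V.getValueType());
    auto [Lo, Hi] = DAG.SplitVector(V, DL, LoVT, HiVT);
    // Re-joining with CONCAT_VECTORS yields a value equivalent to V.
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, V.getValueType(), Lo, Hi);
  }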
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
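getStore composes naturally with getMemBasePlusOffset when lowering needs to write at a byte offset from a base pointer. A minimal sketch (hypothetical helper; the unknown MachinePointerInfo and Align(1) are deliberately conservative assumptions):
  static SDValue storeAtByteOffset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Val, SDValue BasePtr,
                                   uint64_t Offset) {
    // Address = BasePtr + Offset, then a plain ISD::STORE to that address.
    SDValue Addr =
        DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(Offset), DL);
    return DAG.getStore(Chain, DL, Val, Addr, MachinePointerInfo(), Align(1));
  }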
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:785
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:688
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:780
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:811
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:857
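A common pairing of getConstant and getSplatBuildVector is broadcasting an immediate into every lane of a vector type. Rough sketch (hypothetical helper):
  static SDValue splatImmediate(SelectionDAG &DAG, const SDLoc &DL, EVT VecVT,
                                uint64_t Imm) {
    // Build the scalar constant with the element type, then splat it.
    SDValue Elt = DAG.getConstant(Imm, DL, VecVT.getVectorElementType());
    return DAG.getSplatBuildVector(VecVT, DL, Elt);
  }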
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
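The *ExtOrTrunc helpers pick between an extend and a truncate based on the relative widths, which saves a manual comparison in lowering code. Hypothetical one-liner, assuming DAG, DL, an integer SDValue Amt, and a target type VT are in scope:
  // Zero-extend Amt if it is narrower than VT, truncate if it is wider,
  // and return it unchanged if the types already match.
  SDValue FixedAmt = DAG.getZExtOrTrunc(Amt, DL, VT);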
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
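computeKnownBits and MaskedValueIsZero answer the same kind of question at different granularity: the first returns full known-zero/known-one bit vectors, the second checks one mask. Rough sketch (hypothetical, assuming DAG and an integer SDValue Op are in scope):
  KnownBits Known = DAG.computeKnownBits(Op);
  bool LowBitKnownZero = Known.Zero[0];        // bit 0 proven to be zero
  // The same query phrased as a mask test:
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  bool AlsoKnownZero = DAG.MaskedValueIsZero(Op, APInt(BitWidth, 1));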
std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:920
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
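SmallSet's insert doubles as a membership test, which is how visited-set loops typically avoid a separate lookup. A minimal, self-contained sketch:
  #include "llvm/ADT/SmallSet.h"

  static bool firstVisit(llvm::SmallSet<int, 8> &Visited, int Id) {
    // insert() returns {iterator, bool}; the bool is true only when the
    // element was not already present.
    return Visited.insert(Id).second;
  }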
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
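A short, self-contained sketch of the SmallVector operations listed above (push_back, append, resize, data):
  #include "llvm/ADT/SmallVector.h"

  static void smallVectorDemo() {
    llvm::SmallVector<int, 4> Elts;    // inline storage for up to 4 elements
    Elts.push_back(1);
    Elts.append({2, 3, 4});            // append a range of elements
    Elts.resize(8);                    // grow; new slots are value-initialized
    int *Raw = Elts.data();            // contiguous buffer, like std::vector
    (void)Raw;
  }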
An instruction for storing to memory.
Definition: Instructions.h:290
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:556
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:250
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:163
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:251
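A small, self-contained sketch exercising the StringRef queries listed above (starts_with, substr, equals_insensitive):
  #include "llvm/ADT/StringRef.h"

  static bool looksLikeStackProbe(llvm::StringRef Sym) {
    // "__chkstk"-style names: a leading "__" plus a case-insensitive suffix check.
    return Sym.starts_with("__") &&
           Sym.substr(2).equals_insensitive("chkstk");
  }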
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
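StringSwitch gives a readable way to map string literals to values, for example when classifying register names in inline-asm constraint handling. Hypothetical sketch:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  static unsigned classifyRegName(llvm::StringRef Name) {
    return llvm::StringSwitch<unsigned>(Name)
        .Case("eax", 0)
        .Case("ebx", 1)
        .Case("ecx", 2)
        .Default(~0u);   // sentinel for "not recognized"
  }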
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
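setOperationAction is the main knob a target constructor uses to tell legalization what to do per opcode and type. The lines below are a generic, hedged sketch of the pattern (not the actual X86 configuration), written as it would appear inside a TargetLowering-derived constructor:
  // Inside a hypothetical XXXTargetLowering constructor:
  setOperationAction(ISD::ADD,   MVT::i32, Legal);   // natively supported
  setOperationAction(ISD::SREM,  MVT::i64, Expand);  // rewrite into other nodes
  setOperationAction(ISD::CTPOP, MVT::i32, Custom);  // routed to LowerOperation()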
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
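Legality predicates like this one usually gate DAG combines: only form a node the target can actually handle. Hypothetical helper sketch:
  static SDValue tryFormAbs(const TargetLowering &TLI, SelectionDAG &DAG,
                            const SDLoc &DL, EVT VT, SDValue X) {
    // Bail out unless ISD::ABS is legal or has a custom lowering for VT.
    if (!TLI.isOperationLegalOrCustom(ISD::ABS, VT))
      return SDValue();
    return DAG.getNode(ISD::ABS, DL, VT, X);
  }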
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
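A self-contained sketch combining several of the Type predicates listed above into a single query (hypothetical helper):
  #include "llvm/IR/Type.h"

  static bool isSmallIntOrIEEEFloat(llvm::Type *Ty) {
    if (Ty->isFloatingPointTy())
      return Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy();
    return Ty->isIntegerTy() && Ty->getScalarSizeInBits() <= 32;
  }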
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
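replaceAllUsesWith plus use_empty is the standard IR-level rewrite-then-clean-up pattern. Minimal sketch (hypothetical helper):
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Value.h"

  static void replaceAndErase(llvm::Instruction *Old, llvm::Value *New) {
    // Redirect every use of Old to New; Old is then dead and can be removed.
    Old->replaceAllUsesWith(New);
    if (Old->use_empty())
      Old->eraseFromParent();
  }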
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:236
bool hasAnyFMA() const
Definition: X86Subtarget.h:203
bool isOSWindows() const
Definition: X86Subtarget.h:322
bool isTargetMachO() const
Definition: X86Subtarget.h:288
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:221
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool isPICStyleGOT() const
Definition: X86Subtarget.h:328
bool hasSSE42() const
Definition: X86Subtarget.h:198
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:276
bool canUseCMOV() const
Definition: X86Subtarget.h:192
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:331
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:300
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:185
bool isTargetDarwin() const
Definition: X86Subtarget.h:280
bool isTargetWin64() const
Definition: X86Subtarget.h:324
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:178
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:278
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:337
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:232
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool isTargetELF() const
Definition: X86Subtarget.h:286
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:209
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:186
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasInt256() const
Definition: X86Subtarget.h:202
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:329
bool isTargetCygMing() const
Definition: X86Subtarget.h:320
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:284
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:199
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:312
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:316
bool isTargetNaCl64() const
Definition: X86Subtarget.h:296
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:262
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:200
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:2978
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:778
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1167
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1163
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:751
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:490
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:511
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1310
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:742
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1196
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1312
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1282
@ STRICT_FCEIL
Definition: ISDOpcodes.h:440
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1313
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:130
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1043
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1072
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:811
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1295
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:450
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:818
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1269
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1274
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:848
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:491
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:941
@ STRICT_FLOG2
Definition: ISDOpcodes.h:435
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1308
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:931
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1309
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1240
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:974
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1451
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1143
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:141
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:913
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:802
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:684
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:464
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1088
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:750
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1262
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1029
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:786
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:958
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1118
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1311
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1097
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1354
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:755
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1278
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:641
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1192
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:444
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:908
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:944
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:943
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:733
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1306
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1019
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:449
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:438
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:808
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1252
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:884
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:439
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:770
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1314
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1006
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1256
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1082
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:837
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:826
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:916
@ STRICT_FROUND
Definition: ISDOpcodes.h:442
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:764
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:463
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:952
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:441
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:443
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:950
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1304
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:457
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:479
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:456
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1025
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1305
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:864
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1223
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1249
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1048
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:953
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:679
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:434
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:432
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1303
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:979
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:897
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:859
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:935
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:437
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:883
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:814
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1187
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1111
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:791
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1355
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:436
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1053
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1246
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1650
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1645
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1466
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1632
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1607
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1574
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1554
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1613
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1513
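As a usage illustration (a minimal sketch, not code from this file; the helper name emitCtpop and the surrounding Module/IRBuilder context are assumptions), Intrinsic::getDeclaration is typically paired with IRBuilder to materialize and call an intrinsic:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Declare (or reuse) llvm.ctpop.i32 in the module and emit a call to it.
// X is assumed to be an existing i32 Value at the builder's insertion point.
static Value *emitCtpop(Module &M, IRBuilder<> &Builder, Value *X) {
  Function *Ctpop =
      Intrinsic::getDeclaration(&M, Intrinsic::ctpop, {Builder.getInt32Ty()});
  return Builder.CreateCall(Ctpop, {X});
}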
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:972
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
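For orientation, here is a minimal sketch (not code from this file; matchMaskedCmp and the captured A/B/C names are illustrative) of how the PatternMatch helpers above compose into larger patterns, e.g. recognizing 'and (icmp eq A, B), C' with the operands in either order:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is 'and (icmp eq A, B), C' (operands in either order),
// binding the matched pieces to A, B and C.
static bool matchMaskedCmp(Value *V, Value *&A, Value *&B, Value *&C) {
  ICmpInst::Predicate Pred;
  return match(V, m_c_And(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(C))) &&
         Pred == ICmpInst::ICMP_EQ;
}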
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:213
@ FS
Definition: X86.h:210
@ PTR64
Definition: X86.h:214
@ PTR32_SPTR
Definition: X86.h:212
@ GS
Definition: X86.h:209
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:47
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1546
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
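As a self-contained illustration (a sketch, not code from this file), the range helpers indexed here let loops avoid explicit begin/end bookkeeping:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>
using namespace llvm;

int main() {
  SmallVector<int, 4> Mask = {0, 1, 2, 3};
  // all_of takes the range directly instead of a begin/end pair.
  bool NoSentinels = all_of(Mask, [](int M) { return M >= 0; });
  // enumerate pairs each element with its index.
  for (auto [Idx, Elt] : enumerate(Mask))
    std::printf("Mask[%zu] = %d\n", Idx, Elt);
  return NoSentinels ? 0 : 1;
}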
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:359
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
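As an illustration of how these decoders are consumed (a sketch, not code from this file; the include path for the X86 shuffle decoder is an assumption), DecodeBLENDMask expands a BLENDI immediate into explicit VECTOR_SHUFFLE indices, where indices 0..N-1 select from the first operand and N..2N-1 from the second:

#include "llvm/ADT/SmallVector.h"
#include "MCTargetDesc/X86ShuffleDecode.h" // assumed include path for the decoder
using namespace llvm;

void decodeBlendExample() {
  SmallVector<int, 8> ShuffleMask;
  // v8i16 blend with immediate 0xA5 (bits 0, 2, 5 and 7 set): a set bit picks
  // the element from the second operand, a clear bit picks it from the first.
  DecodeBLENDMask(/*NumElts=*/8, /*Imm=*/0xA5, ShuffleMask);
  // ShuffleMask is now {8, 1, 10, 3, 4, 13, 6, 15}.
}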
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
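A tiny self-contained check (a sketch, not code from this file) of the power-of-two and log helpers indexed here:

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(isPowerOf2_64(64) && Log2_64(64) == 6);
  // 48 is not a power of two; its ceiling log2 rounds up to 6.
  assert(!isPowerOf2_64(48) && Log2_64_Ceil(48) == 6);
  return 0;
}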
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2013
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1528
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:346
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
@ SM_SentinelUndef
@ SM_SentinelZero
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
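For example (a sketch, not code from this file), commonAlignment answers what alignment still holds after stepping a known-aligned pointer by a byte offset:

#include "llvm/Support/Alignment.h"
using namespace llvm;

int main() {
  Align Base(16);
  // An address that is 16-byte aligned plus an offset of 8 is only guaranteed
  // to be 8-byte aligned.
  Align A = commonAlignment(Base, /*Offset=*/8);
  return A.value() == 8 ? 0 : 1;
}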
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:219
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) nee...
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:250
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:254
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:294
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:278
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:323
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:277
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:274
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:275
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:266
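As a usage note (a sketch, not code from this file), the fltSemantics accessors and rounding modes above are what APFloat::convert takes when changing precision:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

int main() {
  APFloat V(3.14159265358979); // starts in IEEEdouble semantics
  bool LosesInfo = false;
  // Narrowing pi to single precision is inexact, so LosesInfo becomes true.
  APFloat::opStatus St =
      V.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return (St == APFloat::opInexact && LosesInfo) ? 0 : 1;
}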
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:274
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
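The EVT queries above are the basic vocabulary of this file's lowering logic: classify a type, then derive a widened, narrowed, or integer-typed sibling. A hedged sketch of the common derivations; the type choices are illustrative:

  #include <cassert>
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"

  void evtExample(llvm::LLVMContext &Ctx) {
    llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 8);  // v8f32
    assert(VT.is256BitVector() && VT.isFloatingPoint());

    llvm::EVT IntVT  = VT.changeVectorElementTypeToInteger();       // v8i32
    llvm::EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);          // v4f32
    llvm::EVT WideVT = VT.getDoubleNumVectorElementsVT(Ctx);        // v16f32

    assert(VT.getScalarSizeInBits() == 32 && HalfVT.is128BitVector());
    assert(IntVT.isInteger() && WideVT.is512BitVector());
  }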
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
static KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:753
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:488
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:175
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:231
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:278
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:82
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:161
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:214
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:169
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:185
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding or subtracting LHS and RHS, as selected by Add.
Definition: KnownBits.cpp:51
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:94
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:88
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:797
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:526
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:79
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:56
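KnownBits is the currency of the computeKnownBitsForTargetNode / SimplifyDemandedBits machinery referenced above: each operand contributes known-zero and known-one bits, and the combiner intersects, extends, or arithmetically combines them. A small hedged sketch with illustrative constants:

  #include <cassert>
  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"

  void knownBitsExample() {
    llvm::KnownBits A = llvm::KnownBits::makeConstant(llvm::APInt(32, 0x10));
    llvm::KnownBits B = llvm::KnownBits::makeConstant(llvm::APInt(32, 0x08));

    // Adding two fully known values yields another fully known value.
    llvm::KnownBits Sum = llvm::KnownBits::computeForAddSub(
        /*Add=*/true, /*NSW=*/false, /*NUW=*/false, A, B);
    assert(Sum.isConstant() && Sum.getConstant() == 0x18);
    assert(Sum.countMinTrailingZeros() == 3);   // 0x18 == 0b11000

    // Intersecting with a completely unknown value keeps only the bits both
    // states agree on, i.e. nothing here.
    llvm::KnownBits Unknown(32);
    assert(Sum.intersectWith(Unknown).isUnknown());
  }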
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
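MachinePointerInfo is how the loads and stores created during lowering tell alias analysis what memory they touch (an IR value, a stack slot, the constant pool, the GOT, ...). A hedged sketch of the constant-pool case; the helper name is hypothetical:

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"   // declares MachinePointerInfo

  llvm::MachinePointerInfo constantPoolSlot(llvm::MachineFunction &MF,
                                            int64_t Offset) {
    // Refers to the constant pool of MF, displaced by Offset bytes.
    return llvm::MachinePointerInfo::getConstantPool(MF).getWithOffset(Offset);
  }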
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
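CallLoweringInfo bundles everything LowerCallTo needs, and its setters return the object itself so they can be chained. A hedged sketch of the usual libcall pattern; the helper name and parameter list are hypothetical, but the chaining mirrors how lowering code typically uses these setters:

  #include <utility>
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"

  std::pair<llvm::SDValue, llvm::SDValue>
  emitLibCall(const llvm::TargetLowering &TLI, llvm::SelectionDAG &DAG,
              const llvm::SDLoc &dl, llvm::SDValue Chain, llvm::SDValue Callee,
              llvm::Type *RetTy, llvm::TargetLowering::ArgListTy &&Args) {
    llvm::TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
        .setChain(Chain)
        .setLibCallee(llvm::CallingConv::C, RetTy, Callee, std::move(Args));
    return TLI.LowerCallTo(CLI);   // {returned value, updated chain}
  }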
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
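DAGCombinerInfo and TargetLoweringOpt are the two channels through which target combines report replacements back to the generic combiner: CombineTo performs the replacement directly, while a TargetLoweringOpt accumulates replacements until CommitTargetLoweringOpt is called. A hedged sketch of the second pattern; the helper name is hypothetical and the shape follows the common SimplifyDemandedBits idiom:

  #include "llvm/ADT/APInt.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/Support/KnownBits.h"

  static llvm::SDValue
  simplifyOperandBits(llvm::SDNode *N, llvm::SDValue Op,
                      llvm::TargetLowering::DAGCombinerInfo &DCI,
                      const llvm::TargetLowering &TLI) {
    llvm::TargetLowering::TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                                                !DCI.isBeforeLegalizeOps());
    llvm::KnownBits Known;
    llvm::APInt Demanded =
        llvm::APInt::getAllOnes(Op.getValueType().getScalarSizeInBits());
    if (TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);   // apply the replacements queued in TLO
      return llvm::SDValue(N, 0);         // signal that N was updated in place
    }
    return llvm::SDValue();
  }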
X86AddressMode - This struct holds a generalized full x86 address mode.
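X86AddressMode, together with helpers such as addDirectMem (listed further above), is how post-ISel code attaches the base + scale*index + displacement operand group to a MachineInstr. A hedged sketch of the simplest case, a register-direct load; the helper name is hypothetical:

  #include "X86InstrBuilder.h"                    // addDirectMem, X86AddressMode
  #include "X86InstrInfo.h"                       // X86::MOV32rm opcode enum
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"

  static void emitRegDirectLoad(llvm::MachineBasicBlock &MBB,
                                llvm::MachineBasicBlock::iterator InsertPt,
                                const llvm::DebugLoc &DL,
                                const llvm::TargetInstrInfo *TII,
                                llvm::Register DstReg, llvm::Register AddrReg) {
    // MOV32rm DstReg, [AddrReg] : base register only, scale 1, no index,
    // zero displacement, no segment.
    llvm::addDirectMem(
        llvm::BuildMI(MBB, InsertPt, DL, TII->get(llvm::X86::MOV32rm), DstReg),
        AddrReg);
  }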